feat: Sistema CBCFacil completo con cola secuencial

- Implementa ProcessingMonitor singleton para procesamiento secuencial de archivos
- Agrega AI summary service con soporte para MiniMax API
- Agrega PDF generator para resúmenes
- Agrega watchers para monitoreo de carpeta remota
- Mejora sistema de notificaciones Telegram
- Implementa gestión de VRAM para GPU
- Configuración mediante variables de entorno (sin hardcoded secrets)
- .env y transcriptions/ agregados a .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 15:35:39 +00:00
parent dcf887c510
commit ee8fc183be
77 changed files with 3734 additions and 20263 deletions
--- a/services/vram_manager.py
+++ b/services/vram_manager.py
@@ -1,172 +1,307 @@
 """
-VRAM/GPU memory management service
+Gestor de VRAM para descargar modelos de ML inactivos.
+
+Proporciona limpieza automática de modelos (como Whisper) que no han sido
+usados durante un tiempo configurable para liberar memoria VRAM.
+
+OPTIMIZACIONES:
+    - Integración con cache global de modelos
+    - Limpieza agresiva de cache CUDA
+    - Monitoreo de memoria en tiempo real
 """
 import gc
 import logging
-import os
 import time
-from datetime import datetime, timedelta
-from typing import Optional, Dict, Any
-from core import BaseService
-from config import settings
+from typing import Callable, Dict, Optional

-try:
-    import torch
-    TORCH_AVAILABLE = True
-except ImportError:
-    TORCH_AVAILABLE = False
+from config.settings import settings

-# Import gpu_detector after torch check
-from .gpu_detector import gpu_detector, GPUType
+logger = logging.getLogger(__name__)


-class VRAMManager(BaseService):
-    """Service for managing GPU VRAM usage"""
+def get_gpu_memory_mb() -> Dict[str, float]:
+    """
+    Obtiene uso de memoria GPU en MB.

-    def __init__(self):
-        super().__init__("VRAMManager")
-        self._whisper_model = None
-        self._ocr_models = None
-        self._trocr_models = None
-        self._models_last_used: Optional[datetime] = None
-        self._cleanup_threshold = 0.7
-        self._cleanup_interval = 300
-        self._last_cleanup: Optional[datetime] = None
+    Returns:
+        Dict con 'total', 'used', 'free' en MB.
+    """
+    try:
+        import torch

-    def initialize(self) -> None:
-        """Initialize VRAM manager"""
-        # Initialize GPU detector first
-        gpu_detector.initialize()
-        
-        if not TORCH_AVAILABLE:
-            self.logger.warning("PyTorch not available - VRAM management disabled")
-            return
+        if torch.cuda.is_available():
+            props = torch.cuda.get_device_properties(0)
+            total = props.total_memory / (1024 ** 2)
+            allocated = torch.cuda.memory_allocated(0) / (1024 ** 2)
+            reserved = torch.cuda.memory_reserved(0) / (1024 ** 2)

-        if gpu_detector.is_available():
-            gpu_type = gpu_detector.gpu_type
-            device_name = gpu_detector.get_device_name()
-            
-            if gpu_type == GPUType.AMD:
-                self.logger.info(f"VRAM Manager initialized with AMD ROCm: {device_name}")
-            elif gpu_type == GPUType.NVIDIA:
-                os.environ['CUDA_VISIBLE_DEVICES'] = settings.CUDA_VISIBLE_DEVICES
-                if settings.PYTORCH_CUDA_ALLOC_CONF:
-                    torch.backends.cuda.max_split_size_mb = int(settings.PYTORCH_CUDA_ALLOC_CONF.split(':')[1])
-                self.logger.info(f"VRAM Manager initialized with NVIDIA CUDA: {device_name}")
-        else:
-            self.logger.warning("No GPU available - GPU acceleration disabled")
+            return {
+                "total": total,
+                "used": allocated,
+                "free": total - reserved,
+                "reserved": reserved,
+            }
+    except ImportError:
+        pass
+    except Exception as e:
+        logger.debug(f"Error obteniendo memoria GPU: {e}")

-    def cleanup(self) -> None:
-        """Cleanup all GPU models"""
-        if not TORCH_AVAILABLE or not torch.cuda.is_available():
-            return
+    return {"total": 0, "used": 0, "free": 0, "reserved": 0}

-        models_freed = []

-        if self._whisper_model is not None:
-            try:
-                del self._whisper_model
-                self._whisper_model = None
-                models_freed.append("Whisper")
-            except Exception as e:
-                self.logger.error(f"Error freeing Whisper VRAM: {e}")
+def clear_cuda_cache(aggressive: bool = False) -> None:
+    """
+    Limpia el cache de CUDA.

-        if self._ocr_models is not None:
-            try:
-                self._ocr_models = None
-                models_freed.append("OCR")
-            except Exception as e:
-                self.logger.error(f"Error freeing OCR VRAM: {e}")
+    Args:
+        aggressive: Si True, ejecuta gc.collect() múltiples veces.
+    """
+    try:
+        import torch

-        if self._trocr_models is not None:
-            try:
-                if isinstance(self._trocr_models, dict):
-                    model = self._trocr_models.get('model')
-                    if model is not None:
-                        model.to('cpu')
-                        models_freed.append("TrOCR")
-                torch.cuda.empty_cache()
-            except Exception as e:
-                self.logger.error(f"Error freeing TrOCR VRAM: {e}")
-
-        self._whisper_model = None
-        self._ocr_models = None
-        self._trocr_models = None
-        self._models_last_used = None
-
-        if models_freed:
-            self.logger.info(f"Freed VRAM for models: {', '.join(models_freed)}")
-
-        self._force_aggressive_cleanup()
-
-    def update_usage(self) -> None:
-        """Update usage timestamp"""
-        self._models_last_used = datetime.utcnow()
-        self.logger.debug(f"VRAM usage timestamp updated")
-
-    def should_cleanup(self) -> bool:
-        """Check if cleanup should be performed"""
-        if not TORCH_AVAILABLE or not torch.cuda.is_available():
-            return False
-        if self._last_cleanup is None:
-            return True
-        if (datetime.utcnow() - self._last_cleanup).total_seconds() < self._cleanup_interval:
-            return False
-        allocated = torch.cuda.memory_allocated(0)
-        total = torch.cuda.get_device_properties(0).total_memory
-        return allocated / total > self._cleanup_threshold
-
-    def lazy_cleanup(self) -> None:
-        """Perform cleanup if needed"""
-        if self.should_cleanup():
-            self.cleanup()
-            self._last_cleanup = datetime.utcnow()
-
-    def _force_aggressive_cleanup(self) -> None:
-        """Force aggressive VRAM cleanup"""
-        if not TORCH_AVAILABLE or not torch.cuda.is_available():
-            return
-        try:
-            before_allocated = torch.cuda.memory_allocated(0) / 1024**3
-            before_reserved = torch.cuda.memory_reserved(0) / 1024**3
-            self.logger.debug(f"Before cleanup - Allocated: {before_allocated:.2f}GB, Reserved: {before_reserved:.2f}GB")
-            gc.collect(0)
+        if torch.cuda.is_available():
            torch.cuda.empty_cache()
-            after_allocated = torch.cuda.memory_allocated(0) / 1024**3
-            after_reserved = torch.cuda.memory_reserved(0) / 1024**3
-            self.logger.debug(f"After cleanup - Allocated: {after_allocated:.2f}GB, Reserved: {after_reserved:.2f}GB")
-            if after_reserved < before_reserved:
-                self.logger.info(f"VRAM freed: {(before_reserved - after_reserved):.2f}GB")
+
+            if aggressive:
+                for _ in range(3):
+                    gc.collect()
+                    torch.cuda.empty_cache()
+
+            logger.debug(
+                "CUDA cache limpiada",
+                extra={"aggressive": aggressive, "memory_mb": get_gpu_memory_mb()},
+            )
+    except ImportError:
+        pass
+
+
+class VRAMManager:
+    """
+    Gestor singleton para administrar la descarga automática de modelos.
+
+    Mantiene registro del último uso de cada modelo y proporciona métodos
+    para verificar y limpiar modelos inactivos.
+
+    NOTA: Con el nuevo cache global de modelos, este gestor ya no fuerza
+    la descarga del modelo en sí, solo coordina los tiempos de cleanup.
+    """
+
+    _instance: Optional["VRAMManager"] = None
+
+    def __new__(cls) -> "VRAMManager":
+        """Return the shared instance, creating it on the first call."""
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            cls._instance._initialized = False  # __init__ checks this flag
+        return cls._instance
+
+    def __init__(self) -> None:
+        """Set up the registries once; later constructions return early."""
+        if self._initialized:
+            return
+
+        self._last_usage: Dict[str, float] = {}  # model_id -> time.time()
+        self._unload_callbacks: Dict[str, Callable[[], None]] = {}
+        self._auto_unload_seconds = settings.WHISPER_AUTO_UNLOAD_SECONDS
+        self._initialized = True
+
+        logger.info(
+            "VRAMManager inicializado",
+            extra={"auto_unload_seconds": self._auto_unload_seconds},
+        )
+
+    def register_model(
+        self, model_id: str, unload_callback: Callable[[], None]
+    ) -> None:
+        """
+        Register a model together with its unload callback.
+
+        Args:
+            model_id: Unique identifier of the model.
+            unload_callback: Function to call to unload the model.
+        """
+        self._unload_callbacks[model_id] = unload_callback
+        self._last_usage[model_id] = time.time()  # registering counts as a use
+
+        logger.debug(
+            "Modelo registrado en VRAMManager",
+            extra={"model_id": model_id},
+        )
+
+    def update_usage(self, model_id: str) -> None:
+        """
+        Record the current time as the model's last use.
+
+        Args:
+            model_id: Identifier of the model.
+        """
+        self._last_usage[model_id] = time.time()
+
+        logger.debug(
+            "Uso actualizado",
+            extra={"model_id": model_id, "memory_mb": get_gpu_memory_mb()},
+        )
+
+    def mark_used(self, model_id: str = "default") -> None:
+        """
+        Mark the model as used (thin alias for update_usage).
+
+        Args:
+            model_id: Identifier of the model. Defaults to "default".
+        """
+        self.update_usage(model_id)
+
+    def check_and_cleanup(
+        self, model_id: str, timeout_seconds: Optional[int] = None
+    ) -> bool:
+        """
+        Unload the model if it has been idle for at least the timeout.
+
+        NOTE (from the original author): with the global cache, unloading
+        only drops the local reference; the model may remain cached.
+
+        Args:
+            model_id: Identifier of the model to check.
+            timeout_seconds: Max idle seconds; None uses the configured default.
+
+        Returns:
+            True if the model was unloaded, False otherwise.
+        """
+        if model_id not in self._unload_callbacks:
+            logger.warning(
+                "Modelo no registrado en VRAMManager",
+                extra={"model_id": model_id},
+            )
+            return False
+
+        if timeout_seconds is None:  # an explicit 0 must not fall back to default
+            timeout_seconds = self._auto_unload_seconds
+        elapsed = time.time() - self._last_usage.get(model_id, 0)
+
+        logger.debug(
+            "Verificando modelo",
+            extra={
+                "model_id": model_id,
+                "elapsed_seconds": elapsed,
+                "threshold_seconds": timeout_seconds,
+            },
+        )
+
+        if elapsed >= timeout_seconds:
+            return self._unload_model(model_id)
+
+        return False
+
+    def _unload_model(self, model_id: str) -> bool:
+        """
+        Descarga el modelo invocando su callback.
+
+        Args:
+            model_id: Identificador del modelo a descargar.
+
+        Returns:
+            True si la descarga fue exitosa.
+        """
+        callback = self._unload_callbacks.get(model_id)
+        if callback is None:
+            return False
+
+        try:
+            callback()
+
+            # Limpiar cache de CUDA después de descargar
+            clear_cuda_cache(aggressive=True)
+
+            # Limpiar registro después de descarga exitosa
+            self._unload_callbacks.pop(model_id, None)
+            self._last_usage.pop(model_id, None)
+
+            logger.info(
+                "Modelo descargado por VRAMManager",
+                extra={
+                    "model_id": model_id,
+                    "reason": "inactive",
+                    "memory_mb_after": get_gpu_memory_mb(),
+                },
+            )
+            return True
+
        except Exception as e:
-            self.logger.error(f"Error in aggressive VRAM cleanup: {e}")
+            logger.error(
+                "Error al descargar modelo",
+                extra={"model_id": model_id, "error": str(e)},
+            )
+            return False

-    def get_usage(self) -> Dict[str, Any]:
-        """Get VRAM usage information"""
-        if not TORCH_AVAILABLE:
-            return {'error': 'PyTorch not available'}
-        if not torch.cuda.is_available():
-            return {'error': 'CUDA not available'}
-        total = torch.cuda.get_device_properties(0).total_memory / 1024**3
-        allocated = torch.cuda.memory_allocated(0) / 1024**3
-        cached = torch.cuda.memory_reserved(0) / 1024**3
-        free = total - allocated
-        return {
-            'total_gb': round(total, 2),
-            'allocated_gb': round(allocated, 2),
-            'cached_gb': round(cached, 2),
-            'free_gb': round(free, 2),
-            'whisper_loaded': self._whisper_model is not None,
-            'ocr_models_loaded': self._ocr_models is not None,
-            'trocr_models_loaded': self._trocr_models is not None,
-            'last_used': self._models_last_used.isoformat() if self._models_last_used else None,
-            'timeout_seconds': settings.MODEL_TIMEOUT_SECONDS
-        }
+    def force_unload(self, model_id: str) -> bool:
+        """
+        Fuerza la descarga inmediata de un modelo.

-    def force_free(self) -> str:
-        """Force immediate VRAM free"""
-        self.cleanup()
-        return "VRAM freed successfully"
+        Args:
+            model_id: Identificador del modelo a descargar.
+
+        Returns:
+            True si la descarga fue exitosa.
+        """
+        return self._unload_model(model_id)
+
+    def get_memory_info(self) -> Dict[str, float]:
+        """
+        Return the current GPU memory figures from get_gpu_memory_mb().
+
+        Returns:
+            Dict with 'total', 'used', 'free', 'reserved' in MB.
+        """
+        return get_gpu_memory_mb()
+
+    def get_last_usage(self, model_id: str) -> Optional[float]:
+        """
+        Return the timestamp of the model's last use.
+
+        Args:
+            model_id: Identifier of the model.
+
+        Returns:
+            The time.time() value of the last use, or None if not tracked.
+        """
+        return self._last_usage.get(model_id)
+
+    def get_seconds_since_last_use(self, model_id: str) -> Optional[float]:
+        """
+        Return how many seconds have passed since the model's last use.
+
+        Args:
+            model_id: Identifier of the model.
+
+        Returns:
+            Elapsed seconds, or None if the model has no recorded use.
+        """
+        last_used = self._last_usage.get(model_id)
+        if last_used is None:
+            return None
+        return time.time() - last_used
+
+    def unregister_model(self, model_id: str) -> None:
+        """
+        Remove a model's registration without calling its unload callback.
+
+        Args:
+            model_id: Identifier of the model to remove.
+        """
+        self._unload_callbacks.pop(model_id, None)
+        self._last_usage.pop(model_id, None)
+
+        logger.debug(
+            "Modelo eliminado de VRAMManager",
+            extra={"model_id": model_id},
+        )
+
+    def clear_all(self) -> None:
+        """Drop every registration; no unload callback is invoked."""
+        self._unload_callbacks.clear()
+        self._last_usage.clear()
+        logger.info("VRAMManager limpiado")


-# Global instance
+# Instancia global singleton
 vram_manager = VRAMManager()