feat: Sistema CBCFacil completo con cola secuencial

- Implementa ProcessingMonitor singleton para procesamiento secuencial de archivos
- Agrega AI summary service con soporte para MiniMax API
- Agrega PDF generator para resúmenes
- Agrega watchers para monitoreo de carpeta remota
- Mejora sistema de notificaciones Telegram
- Implementa gestión de VRAM para GPU
- Configuración mediante variables de entorno (sin hardcoded secrets)
- .env y transcriptions/ agregados a .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 15:35:39 +00:00
parent dcf887c510
commit ee8fc183be
77 changed files with 3734 additions and 20263 deletions
--- a/config/settings.py
+++ b/config/settings.py
@@ -1,261 +1,78 @@
 """
-Centralized configuration management for CBCFacil
+Configuración centralizada de la aplicación.
+Carga variables de entorno desde archivo .env
 """
-
 import os
 from pathlib import Path
-from typing import Optional, Set, Union
+from typing import Optional
 from dotenv import load_dotenv

-# Load environment variables from .env file
+# Cargar variables de entorno
 load_dotenv()


-class ConfigurationError(Exception):
-    """Raised when configuration is invalid"""
-
-    pass
-
-
 class Settings:
-    """Application settings loaded from environment variables"""
+    """Configuración centralizada de la aplicación."""

-    # Application
-    APP_NAME: str = "CBCFacil"
-    APP_VERSION: str = "8.0"
-    DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"
-
-    # Nextcloud/WebDAV Configuration
+    # Nextcloud/WebDAV
    NEXTCLOUD_URL: str = os.getenv("NEXTCLOUD_URL", "")
    NEXTCLOUD_USER: str = os.getenv("NEXTCLOUD_USER", "")
    NEXTCLOUD_PASSWORD: str = os.getenv("NEXTCLOUD_PASSWORD", "")
-    WEBDAV_ENDPOINT: str = NEXTCLOUD_URL

-    # Remote folders
-    REMOTE_AUDIOS_FOLDER: str = "Audios"
-    REMOTE_DOCX_AUDIO_FOLDER: str = "Documentos"
-    REMOTE_PDF_FOLDER: str = "Pdf"
-    REMOTE_TXT_FOLDER: str = "Textos"
-    RESUMENES_FOLDER: str = "Resumenes"
-    DOCX_FOLDER: str = "Documentos"
-
-    # Local paths
-    BASE_DIR: Path = Path(__file__).resolve().parent.parent
-    LOCAL_STATE_DIR: str = os.getenv("LOCAL_STATE_DIR", str(BASE_DIR))
-    LOCAL_DOWNLOADS_PATH: Path = BASE_DIR / "downloads"
-    LOCAL_RESUMENES: Path = LOCAL_DOWNLOADS_PATH
-    LOCAL_DOCX: Path = BASE_DIR / "resumenes_docx"
-
-    # Processing
-    POLL_INTERVAL: int = int(os.getenv("POLL_INTERVAL", "5"))
-    HTTP_TIMEOUT: int = int(os.getenv("HTTP_TIMEOUT", "30"))
-    WEBDAV_MAX_RETRIES: int = int(os.getenv("WEBDAV_MAX_RETRIES", "3"))
-    DOWNLOAD_CHUNK_SIZE: int = int(
-        os.getenv("DOWNLOAD_CHUNK_SIZE", "65536")
-    )  # 64KB for better performance
-    MAX_FILENAME_LENGTH: int = int(os.getenv("MAX_FILENAME_LENGTH", "80"))
-    MAX_FILENAME_BASE_LENGTH: int = int(os.getenv("MAX_FILENAME_BASE_LENGTH", "40"))
-    MAX_FILENAME_TOPICS_LENGTH: int = int(os.getenv("MAX_FILENAME_TOPICS_LENGTH", "20"))
-
-    # File extensions
-    AUDIO_EXTENSIONS: Set[str] = {".mp3", ".wav", ".m4a", ".ogg", ".aac"}
-    PDF_EXTENSIONS: Set[str] = {".pdf"}
-    TXT_EXTENSIONS: Set[str] = {".txt"}
-
-    # AI Providers
-    ZAI_BASE_URL: str = os.getenv("ZAI_BASE_URL", "https://api.z.ai/api/anthropic")
-    ZAI_DEFAULT_MODEL: str = os.getenv("ZAI_MODEL", "glm-4.6")
-    ZAI_AUTH_TOKEN: Optional[str] = os.getenv("ANTHROPIC_AUTH_TOKEN") or os.getenv(
-        "ZAI_AUTH_TOKEN", ""
-    )
-
-    # Notion Integration
-    NOTION_API_TOKEN: Optional[str] = os.getenv("NOTION_API")
-    NOTION_DATABASE_ID: Optional[str] = os.getenv("NOTION_DATABASE_ID")
-
-    # Gemini
-    GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
-    GEMINI_FLASH_MODEL: str = os.getenv("GEMINI_FLASH_MODEL", "gemini-2.5-flash")
-    GEMINI_PRO_MODEL: str = os.getenv("GEMINI_PRO_MODEL", "gemini-1.5-pro")
-
-    # CLI paths
-    GEMINI_CLI_PATH: Optional[str] = os.getenv("GEMINI_CLI_PATH")
-    CLAUDE_CLI_PATH: Optional[str] = os.getenv("CLAUDE_CLI_PATH")
-
-    # Telegram
+    # Telegram (opcional)
    TELEGRAM_TOKEN: Optional[str] = os.getenv("TELEGRAM_TOKEN")
    TELEGRAM_CHAT_ID: Optional[str] = os.getenv("TELEGRAM_CHAT_ID")

-    # PDF Processing Configuration
-    CPU_COUNT: int = os.cpu_count() or 1
-    PDF_MAX_PAGES_PER_CHUNK: int = int(os.getenv("PDF_MAX_PAGES_PER_CHUNK", "2"))
-    PDF_DPI: int = int(os.getenv("PDF_DPI", "200"))
-    PDF_RENDER_THREAD_COUNT: int = int(
-        os.getenv("PDF_RENDER_THREAD_COUNT", str(min(4, CPU_COUNT)))
-    )
-    PDF_BATCH_SIZE: int = int(os.getenv("PDF_BATCH_SIZE", "2"))
-    PDF_TROCR_MAX_BATCH: int = int(
-        os.getenv("PDF_TROCR_MAX_BATCH", str(PDF_BATCH_SIZE))
-    )
-    PDF_TESSERACT_THREADS: int = int(
-        os.getenv("PDF_TESSERACT_THREADS", str(max(1, min(2, max(1, CPU_COUNT // 3)))))
-    )
-    PDF_PREPROCESS_THREADS: int = int(
-        os.getenv("PDF_PREPROCESS_THREADS", str(PDF_TESSERACT_THREADS))
-    )
-    PDF_TEXT_DETECTION_MIN_RATIO: float = float(
-        os.getenv("PDF_TEXT_DETECTION_MIN_RATIO", "0.6")
-    )
-    PDF_TEXT_DETECTION_MIN_AVG_CHARS: int = int(
-        os.getenv("PDF_TEXT_DETECTION_MIN_AVG_CHARS", "120")
-    )
+    # AI Providers (opcional)
+    GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
+    DEEPINFRA_API_KEY: Optional[str] = os.getenv("DEEPINFRA_API_KEY")
+    ANTHROPIC_AUTH_TOKEN: Optional[str] = os.getenv("ANTHROPIC_AUTH_TOKEN")
+    ANTHROPIC_BASE_URL: Optional[str] = os.getenv("ANTHROPIC_BASE_URL")
+    ANTHROPIC_MODEL: str = os.getenv("ANTHROPIC_MODEL", "glm-4.7")

-    # Error handling
-    ERROR_THROTTLE_SECONDS: int = int(os.getenv("ERROR_THROTTLE_SECONDS", "600"))
-
-    # GPU/VRAM Management
-    MODEL_TIMEOUT_SECONDS: int = int(os.getenv("MODEL_TIMEOUT_SECONDS", "300"))
-    CUDA_VISIBLE_DEVICES: str = os.getenv("CUDA_VISIBLE_DEVICES", "all")
-    PYTORCH_CUDA_ALLOC_CONF: str = os.getenv(
-        "PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:512"
-    )
-
-    # GPU Detection (auto, nvidia, amd, cpu)
-    GPU_PREFERENCE: str = os.getenv("GPU_PREFERENCE", "auto")
-    # AMD ROCm HSA override for RX 6000 series (gfx1030)
-    HSA_OVERRIDE_GFX_VERSION: str = os.getenv("HSA_OVERRIDE_GFX_VERSION", "10.3.0")
+    # Notion (opcional)
+    NOTION_API: Optional[str] = os.getenv("NOTION_API")
+    NOTION_DATABASE_ID: Optional[str] = os.getenv("NOTION_DATABASE_ID")

    # Dashboard
-    DASHBOARD_SECRET_KEY: str = os.getenv("DASHBOARD_SECRET_KEY", "")
-    DASHBOARD_PORT: int = int(os.getenv("DASHBOARD_PORT", "5000"))
    DASHBOARD_HOST: str = os.getenv("DASHBOARD_HOST", "0.0.0.0")
+    DASHBOARD_PORT: int = int(os.getenv("DASHBOARD_PORT", "5000"))
+    DASHBOARD_SECRET_KEY: Optional[str] = os.getenv("DASHBOARD_SECRET_KEY")
+
+    # Rutas locales
+    BASE_DIR: Path = Path(__file__).parent.parent
+    DOWNLOADS_DIR: Path = BASE_DIR / "downloads"
+    TRANSCRIPTIONS_DIR: Path = BASE_DIR / "transcriptions"
+
+    # Whisper
+    WHISPER_MODEL: str = os.getenv("WHISPER_MODEL", "medium")
+    WHISPER_DEVICE: str = os.getenv("WHISPER_DEVICE", "auto")  # auto, cuda, cpu
+    WHISPER_LANGUAGE: str = os.getenv("WHISPER_LANGUAGE", "es")
+    WHISPER_AUTO_UNLOAD_SECONDS: int = int(os.getenv("WHISPER_AUTO_UNLOAD_SECONDS", "300"))  # 5 minutos
+
+    # Configuración de polling
+    POLL_INTERVAL: int = int(os.getenv("POLL_INTERVAL", "30"))  # segundos
+    WATCHED_REMOTE_PATH: str = os.getenv("WATCHED_REMOTE_PATH", "/Audios")

    # Logging
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
    LOG_FILE: Optional[str] = os.getenv("LOG_FILE")

-    # Threading optimization
-    OMP_NUM_THREADS: int = int(os.getenv("OMP_NUM_THREADS", "4"))
-    MKL_NUM_THREADS: int = int(os.getenv("MKL_NUM_THREADS", "4"))
-
-    # Parallel Processing Configuration
-    MAX_PARALLEL_UPLOADS: int = int(os.getenv("MAX_PARALLEL_UPLOADS", "4"))
-    MAX_PARALLEL_AI_REQUESTS: int = int(os.getenv("MAX_PARALLEL_AI_REQUESTS", "3"))
-    MAX_PARALLEL_PROCESSING: int = int(os.getenv("MAX_PARALLEL_PROCESSING", "2"))
-    PARALLEL_AI_STRATEGY: str = os.getenv("PARALLEL_AI_STRATEGY", "race")  # race, consensus, majority
-    BACKGROUND_NOTION_UPLOADS: bool = os.getenv("BACKGROUND_NOTION_UPLOADS", "true").lower() == "true"
-
-    # ========================================================================
-    # PROPERTIES WITH VALIDATION
-    # ========================================================================
-
-    @property
-    def is_production(self) -> bool:
-        """Check if running in production mode"""
-        return not self.DEBUG
-
    @property
    def has_webdav_config(self) -> bool:
-        """Check if WebDAV credentials are configured"""
-        return all([self.NEXTCLOUD_URL, self.NEXTCLOUD_USER, self.NEXTCLOUD_PASSWORD])
+        """Verifica si hay configuración de WebDAV."""
+        return bool(self.NEXTCLOUD_URL and self.NEXTCLOUD_USER and self.NEXTCLOUD_PASSWORD)

    @property
-    def has_ai_config(self) -> bool:
-        """Check if AI providers are configured"""
-        return any(
-            [
-                self.ZAI_AUTH_TOKEN,
-                self.GEMINI_API_KEY,
-                self.CLAUDE_CLI_PATH,
-                self.GEMINI_CLI_PATH,
-            ]
-        )
-
-    @property
-    def has_notion_config(self) -> bool:
-        """Check if Notion is configured"""
-        return bool(self.NOTION_API_TOKEN and self.NOTION_DATABASE_ID)
-
-    @property
-    def processed_files_path(self) -> Path:
-        """Get the path to the processed files registry"""
-        return Path(
-            os.getenv(
-                "PROCESSED_FILES_PATH",
-                str(Path(self.LOCAL_STATE_DIR) / "processed_files.txt"),
-            )
-        )
-
-    @property
-    def nextcloud_url(self) -> str:
-        """Get Nextcloud URL with validation"""
-        if not self.NEXTCLOUD_URL and self.is_production:
-            raise ConfigurationError("NEXTCLOUD_URL is required in production mode")
-        return self.NEXTCLOUD_URL
-
-    @property
-    def nextcloud_user(self) -> str:
-        """Get Nextcloud username with validation"""
-        if not self.NEXTCLOUD_USER and self.is_production:
-            raise ConfigurationError("NEXTCLOUD_USER is required in production mode")
-        return self.NEXTCLOUD_USER
-
-    @property
-    def nextcloud_password(self) -> str:
-        """Get Nextcloud password with validation"""
-        if not self.NEXTCLOUD_PASSWORD and self.is_production:
-            raise ConfigurationError(
-                "NEXTCLOUD_PASSWORD is required in production mode"
-            )
-        return self.NEXTCLOUD_PASSWORD
-
-    @property
-    def valid_webdav_config(self) -> bool:
-        """Validate WebDAV configuration completeness"""
-        try:
-            _ = self.nextcloud_url
-            _ = self.nextcloud_user
-            _ = self.nextcloud_password
-            return True
-        except ConfigurationError:
-            return False
-
-    @property
-    def telegram_configured(self) -> bool:
-        """Check if Telegram is properly configured"""
+    def has_telegram_config(self) -> bool:
+        """Verifica si hay configuración de Telegram."""
        return bool(self.TELEGRAM_TOKEN and self.TELEGRAM_CHAT_ID)

    @property
-    def has_gpu_support(self) -> bool:
-        """Check if GPU support is available"""
-        try:
-            import torch
-
-            return torch.cuda.is_available()
-        except ImportError:
-            return False
-
-    @property
-    def environment_type(self) -> str:
-        """Get environment type as string"""
-        return "production" if self.is_production else "development"
-
-    @property
-    def config_summary(self) -> dict:
-        """Get configuration summary for logging"""
-        return {
-            "app_name": self.APP_NAME,
-            "version": self.APP_VERSION,
-            "environment": self.environment_type,
-            "debug": self.DEBUG,
-            "webdav_configured": self.has_webdav_config,
-            "ai_configured": self.has_ai_config,
-            "telegram_configured": self.telegram_configured,
-            "gpu_support": self.has_gpu_support,
-            "cpu_count": self.CPU_COUNT,
-            "poll_interval": self.POLL_INTERVAL,
-        }
+    def is_production(self) -> bool:
+        """Verifica si está en modo producción."""
+        return os.getenv("ENV", "development") == "production"


-# Create global settings instance
+# Instancia global de configuración
 settings = Settings()