feat: Integración automática con Notion + análisis completo del código

- Instalado notion-client SDK oficial para integración robusta
- Refactorizado services/notion_service.py con SDK oficial de Notion
  - Rate limiting con retry y exponential backoff
  - Parser Markdown → Notion blocks (headings, bullets, paragraphs)
  - Soporte para pages y databases
  - Manejo robusto de errores
- Integración automática en document/generators.py
  - PDFs se suben automáticamente a Notion después de generarse
  - Contenido completo del resumen formateado con bloques
  - Metadata rica (tipo de archivo, path, fecha)
- Configuración de Notion en main.py
  - Inicialización automática al arrancar el servicio
  - Validación de credenciales
- Actualizado config/settings.py
  - Agregado load_dotenv() para cargar variables de .env
  - Configuración de Notion (NOTION_API, NOTION_DATABASE_ID)
- Scripts de utilidad creados:
  - test_notion_integration.py: Test de subida a Notion
  - test_pipeline_notion.py: Test del pipeline completo
  - verify_notion_permissions.py: Verificación de permisos
  - list_notion_pages.py: Listar páginas accesibles
  - diagnose_notion.py: Diagnóstico completo
  - create_notion_database.py: Crear database automáticamente
  - restart_service.sh: Script de reinicio del servicio
- Documentación completa en opus.md:
  - Análisis exhaustivo del codebase (42 archivos Python)
  - Bugs críticos identificados y soluciones
  - Mejoras de seguridad (autenticación, rate limiting, CORS, CSP)
  - Optimizaciones de rendimiento (Celery, Redis, PostgreSQL, WebSockets)
  - Plan de testing (estructura, ejemplos, 80% coverage goal)
  - Roadmap de implementación (6 sprints detallados)
  - Integración avanzada con Notion documentada

Estado: Notion funcionando correctamente, PDFs se suben automáticamente
2026-01-26 17:26:50 +00:00
parent 47896fd50a
commit 6058dc642e
12 changed files with 3863 additions and 184 deletions
--- a/config/settings.py
+++ b/config/settings.py
@@ -1,19 +1,25 @@
 """
 Centralized configuration management for CBCFacil
 """
+
 import os
 from pathlib import Path
 from typing import Optional, Set, Union
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()


 class ConfigurationError(Exception):
    """Raised when configuration is invalid"""
+
    pass


 class Settings:
    """Application settings loaded from environment variables"""
-    
+
    # Application
    APP_NAME: str = "CBCFacil"
    APP_VERSION: str = "8.0"
@@ -44,7 +50,9 @@ class Settings:
    POLL_INTERVAL: int = int(os.getenv("POLL_INTERVAL", "5"))
    HTTP_TIMEOUT: int = int(os.getenv("HTTP_TIMEOUT", "30"))
    WEBDAV_MAX_RETRIES: int = int(os.getenv("WEBDAV_MAX_RETRIES", "3"))
-    DOWNLOAD_CHUNK_SIZE: int = int(os.getenv("DOWNLOAD_CHUNK_SIZE", "65536"))  # 64KB for better performance
+    DOWNLOAD_CHUNK_SIZE: int = int(
+        os.getenv("DOWNLOAD_CHUNK_SIZE", "65536")
+    )  # 64KB for better performance
    MAX_FILENAME_LENGTH: int = int(os.getenv("MAX_FILENAME_LENGTH", "80"))
    MAX_FILENAME_BASE_LENGTH: int = int(os.getenv("MAX_FILENAME_BASE_LENGTH", "40"))
    MAX_FILENAME_TOPICS_LENGTH: int = int(os.getenv("MAX_FILENAME_TOPICS_LENGTH", "20"))
@@ -57,7 +65,13 @@ class Settings:
    # AI Providers
    ZAI_BASE_URL: str = os.getenv("ZAI_BASE_URL", "https://api.z.ai/api/anthropic")
    ZAI_DEFAULT_MODEL: str = os.getenv("ZAI_MODEL", "glm-4.6")
-    ZAI_AUTH_TOKEN: Optional[str] = os.getenv("ANTHROPIC_AUTH_TOKEN") or os.getenv("ZAI_AUTH_TOKEN", "")
+    ZAI_AUTH_TOKEN: Optional[str] = os.getenv("ANTHROPIC_AUTH_TOKEN") or os.getenv(
+        "ZAI_AUTH_TOKEN", ""
+    )
+
+    # Notion Integration
+    NOTION_API_TOKEN: Optional[str] = os.getenv("NOTION_API")
+    NOTION_DATABASE_ID: Optional[str] = os.getenv("NOTION_DATABASE_ID")

    # Gemini
    GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
@@ -76,13 +90,25 @@ class Settings:
    CPU_COUNT: int = os.cpu_count() or 1
    PDF_MAX_PAGES_PER_CHUNK: int = int(os.getenv("PDF_MAX_PAGES_PER_CHUNK", "2"))
    PDF_DPI: int = int(os.getenv("PDF_DPI", "200"))
-    PDF_RENDER_THREAD_COUNT: int = int(os.getenv("PDF_RENDER_THREAD_COUNT", str(min(4, CPU_COUNT))))
+    PDF_RENDER_THREAD_COUNT: int = int(
+        os.getenv("PDF_RENDER_THREAD_COUNT", str(min(4, CPU_COUNT)))
+    )
    PDF_BATCH_SIZE: int = int(os.getenv("PDF_BATCH_SIZE", "2"))
-    PDF_TROCR_MAX_BATCH: int = int(os.getenv("PDF_TROCR_MAX_BATCH", str(PDF_BATCH_SIZE)))
-    PDF_TESSERACT_THREADS: int = int(os.getenv("PDF_TESSERACT_THREADS", str(max(1, min(2, max(1, CPU_COUNT // 3))))))
-    PDF_PREPROCESS_THREADS: int = int(os.getenv("PDF_PREPROCESS_THREADS", str(PDF_TESSERACT_THREADS)))
-    PDF_TEXT_DETECTION_MIN_RATIO: float = float(os.getenv("PDF_TEXT_DETECTION_MIN_RATIO", "0.6"))
-    PDF_TEXT_DETECTION_MIN_AVG_CHARS: int = int(os.getenv("PDF_TEXT_DETECTION_MIN_AVG_CHARS", "120"))
+    PDF_TROCR_MAX_BATCH: int = int(
+        os.getenv("PDF_TROCR_MAX_BATCH", str(PDF_BATCH_SIZE))
+    )
+    PDF_TESSERACT_THREADS: int = int(
+        os.getenv("PDF_TESSERACT_THREADS", str(max(1, min(2, max(1, CPU_COUNT // 3)))))
+    )
+    PDF_PREPROCESS_THREADS: int = int(
+        os.getenv("PDF_PREPROCESS_THREADS", str(PDF_TESSERACT_THREADS))
+    )
+    PDF_TEXT_DETECTION_MIN_RATIO: float = float(
+        os.getenv("PDF_TEXT_DETECTION_MIN_RATIO", "0.6")
+    )
+    PDF_TEXT_DETECTION_MIN_AVG_CHARS: int = int(
+        os.getenv("PDF_TEXT_DETECTION_MIN_AVG_CHARS", "120")
+    )

    # Error handling
    ERROR_THROTTLE_SECONDS: int = int(os.getenv("ERROR_THROTTLE_SECONDS", "600"))
@@ -90,8 +116,10 @@ class Settings:
    # GPU/VRAM Management
    MODEL_TIMEOUT_SECONDS: int = int(os.getenv("MODEL_TIMEOUT_SECONDS", "300"))
    CUDA_VISIBLE_DEVICES: str = os.getenv("CUDA_VISIBLE_DEVICES", "all")
-    PYTORCH_CUDA_ALLOC_CONF: str = os.getenv("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:512")
-    
+    PYTORCH_CUDA_ALLOC_CONF: str = os.getenv(
+        "PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:512"
+    )
+
    # GPU Detection (auto, nvidia, amd, cpu)
    GPU_PREFERENCE: str = os.getenv("GPU_PREFERENCE", "auto")
    # AMD ROCm HSA override for RX 6000 series (gfx1030)
@@ -113,53 +141,67 @@ class Settings:
    # ========================================================================
    # PROPERTIES WITH VALIDATION
    # ========================================================================
-    
+
    @property
    def is_production(self) -> bool:
        """Check if running in production mode"""
        return not self.DEBUG
-    
+
    @property
    def has_webdav_config(self) -> bool:
        """Check if WebDAV credentials are configured"""
        return all([self.NEXTCLOUD_URL, self.NEXTCLOUD_USER, self.NEXTCLOUD_PASSWORD])
-    
+
    @property
    def has_ai_config(self) -> bool:
        """Check if AI providers are configured"""
-        return any([
-            self.ZAI_AUTH_TOKEN,
-            self.GEMINI_API_KEY,
-            self.CLAUDE_CLI_PATH,
-            self.GEMINI_CLI_PATH
-        ])
-    
+        return any(
+            [
+                self.ZAI_AUTH_TOKEN,
+                self.GEMINI_API_KEY,
+                self.CLAUDE_CLI_PATH,
+                self.GEMINI_CLI_PATH,
+            ]
+        )
+
+    @property
+    def has_notion_config(self) -> bool:
+        """Check if Notion is configured"""
+        return bool(self.NOTION_API_TOKEN and self.NOTION_DATABASE_ID)
+
    @property
    def processed_files_path(self) -> Path:
        """Get the path to the processed files registry"""
-        return Path(os.getenv("PROCESSED_FILES_PATH", str(Path(self.LOCAL_STATE_DIR) / "processed_files.txt")))
-    
+        return Path(
+            os.getenv(
+                "PROCESSED_FILES_PATH",
+                str(Path(self.LOCAL_STATE_DIR) / "processed_files.txt"),
+            )
+        )
+
    @property
    def nextcloud_url(self) -> str:
        """Get Nextcloud URL with validation"""
        if not self.NEXTCLOUD_URL and self.is_production:
            raise ConfigurationError("NEXTCLOUD_URL is required in production mode")
        return self.NEXTCLOUD_URL
-    
+
    @property
    def nextcloud_user(self) -> str:
        """Get Nextcloud username with validation"""
        if not self.NEXTCLOUD_USER and self.is_production:
            raise ConfigurationError("NEXTCLOUD_USER is required in production mode")
        return self.NEXTCLOUD_USER
-    
+
    @property
    def nextcloud_password(self) -> str:
        """Get Nextcloud password with validation"""
        if not self.NEXTCLOUD_PASSWORD and self.is_production:
-            raise ConfigurationError("NEXTCLOUD_PASSWORD is required in production mode")
+            raise ConfigurationError(
+                "NEXTCLOUD_PASSWORD is required in production mode"
+            )
        return self.NEXTCLOUD_PASSWORD
-    
+
    @property
    def valid_webdav_config(self) -> bool:
        """Validate WebDAV configuration completeness"""
@@ -170,26 +212,27 @@ class Settings:
            return True
        except ConfigurationError:
            return False
-    
+
    @property
    def telegram_configured(self) -> bool:
        """Check if Telegram is properly configured"""
        return bool(self.TELEGRAM_TOKEN and self.TELEGRAM_CHAT_ID)
-    
+
    @property
    def has_gpu_support(self) -> bool:
        """Check if GPU support is available"""
        try:
            import torch
+
            return torch.cuda.is_available()
        except ImportError:
            return False
-    
+
    @property
    def environment_type(self) -> str:
        """Get environment type as string"""
        return "production" if self.is_production else "development"
-    
+
    @property
    def config_summary(self) -> dict:
        """Get configuration summary for logging"""
@@ -203,7 +246,7 @@ class Settings:
            "telegram_configured": self.telegram_configured,
            "gpu_support": self.has_gpu_support,
            "cpu_count": self.CPU_COUNT,
-            "poll_interval": self.POLL_INTERVAL
+            "poll_interval": self.POLL_INTERVAL,
        }