feat: Integración automática con Notion + análisis completo del código

- Instalado notion-client SDK oficial para integración robusta
- Refactorizado services/notion_service.py con SDK oficial de Notion
  - Rate limiting con retry y exponential backoff
  - Parser Markdown → Notion blocks (headings, bullets, paragraphs)
  - Soporte para pages y databases
  - Manejo robusto de errores

- Integración automática en document/generators.py
  - PDFs se suben automáticamente a Notion después de generarse
  - Contenido completo del resumen formateado con bloques
  - Metadata rica (tipo de archivo, path, fecha)

- Configuración de Notion en main.py
  - Inicialización automática al arrancar el servicio
  - Validación de credenciales

- Actualizado config/settings.py
  - Agregado load_dotenv() para cargar variables de .env
  - Configuración de Notion (NOTION_API_TOKEN, NOTION_DATABASE_ID)

- Scripts de utilidad creados:
  - test_notion_integration.py: Test de subida a Notion
  - test_pipeline_notion.py: Test del pipeline completo
  - verify_notion_permissions.py: Verificación de permisos
  - list_notion_pages.py: Listar páginas accesibles
  - diagnose_notion.py: Diagnóstico completo
  - create_notion_database.py: Crear database automáticamente
  - restart_service.sh: Script de reinicio del servicio

- Documentación completa en opus.md:
  - Análisis exhaustivo del codebase (42 archivos Python)
  - Bugs críticos identificados y soluciones
  - Mejoras de seguridad (autenticación, rate limiting, CORS, CSP)
  - Optimizaciones de rendimiento (Celery, Redis, PostgreSQL, WebSockets)
  - Plan de testing (estructura, ejemplos, 80% coverage goal)
  - Roadmap de implementación (6 sprints detallados)
  - Integración avanzada con Notion documentada

Estado: Notion funcionando correctamente, PDFs se suben automáticamente
This commit is contained in:
renato97
2026-01-26 17:26:50 +00:00
parent 47896fd50a
commit 6058dc642e
12 changed files with 3863 additions and 184 deletions

294
main.py
View File

@@ -3,6 +3,7 @@
CBCFacil - Main Service Entry Point
Unified AI service for document processing (audio, PDF, text)
"""
import logging
import sys
import time
@@ -16,12 +17,14 @@ from typing import Optional
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
# Configure logging with JSON formatter for production
class JSONFormatter(logging.Formatter):
"""JSON formatter for structured logging in production"""
def format(self, record):
log_entry = {
"timestamp": datetime.utcnow().isoformat() + "Z",
@@ -29,43 +32,43 @@ class JSONFormatter(logging.Formatter):
"message": record.getMessage(),
"module": record.module,
"function": record.funcName,
"line": record.lineno
"line": record.lineno,
}
# Add exception info if present
if record.exc_info:
log_entry["exception"] = self.formatException(record.exc_info)
return json.dumps(log_entry)
def setup_logging() -> logging.Logger:
    """Set up logging configuration for the service.

    Configures the module logger with a stdout handler — JSON-formatted
    in production, human-readable otherwise — plus an optional JSON file
    handler when ``settings.LOG_FILE`` is set.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    from config import settings

    # Create logger; level comes from settings (e.g. "INFO", "DEBUG")
    logger = logging.getLogger(__name__)
    logger.setLevel(getattr(logging, settings.LOG_LEVEL.upper()))

    # Remove existing handlers so repeated calls don't duplicate output
    logger.handlers.clear()

    # Console handler: structured JSON in production, plain text in dev
    console_handler = logging.StreamHandler(sys.stdout)
    if settings.is_production:
        console_handler.setFormatter(JSONFormatter())
    else:
        console_handler.setFormatter(
            logging.Formatter("%(asctime)s [%(levelname)s] - %(name)s - %(message)s")
        )
    logger.addHandler(console_handler)

    # File handler if configured (always JSON for machine parsing)
    if settings.LOG_FILE:
        file_handler = logging.FileHandler(settings.LOG_FILE)
        file_handler.setFormatter(JSONFormatter())
        logger.addHandler(file_handler)

    return logger
@@ -74,9 +77,12 @@ logger = setup_logging()
def acquire_lock() -> int:
"""Acquire single instance lock"""
lock_file = Path(os.getenv("LOCAL_STATE_DIR", str(Path(__file__).parent))) / ".main_service.lock"
lock_file = (
Path(os.getenv("LOCAL_STATE_DIR", str(Path(__file__).parent)))
/ ".main_service.lock"
)
lock_file.parent.mkdir(parents=True, exist_ok=True)
lock_fd = open(lock_file, 'w')
lock_fd = open(lock_file, "w")
fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
lock_fd.write(str(os.getpid()))
lock_fd.flush()
@@ -96,11 +102,13 @@ def release_lock(lock_fd) -> None:
def validate_configuration() -> None:
    """Validate environment configuration at startup.

    Delegates to ``config.validators.validate_environment``; logs how
    many warnings were produced on success and re-raises
    ``ConfigurationError`` so the service fails fast on an invalid
    environment.

    Raises:
        ConfigurationError: when the environment fails validation.
    """
    from config.validators import validate_environment, ConfigurationError

    try:
        warnings = validate_environment()
        if warnings:
            logger.info(
                f"Configuration validation completed with {len(warnings)} warnings"
            )
    except ConfigurationError as e:
        logger.error(f"Configuration validation failed: {e}")
        raise
@@ -113,13 +121,13 @@ def check_service_health() -> dict:
"""
from config import settings
from services.webdav_service import webdav_service
health_status = {
"timestamp": datetime.utcnow().isoformat(),
"status": "healthy",
"services": {}
"services": {},
}
# Check WebDAV
try:
if settings.has_webdav_config:
@@ -129,15 +137,13 @@ def check_service_health() -> dict:
else:
health_status["services"]["webdav"] = {"status": "not_configured"}
except Exception as e:
health_status["services"]["webdav"] = {
"status": "unhealthy",
"error": str(e)
}
health_status["services"]["webdav"] = {"status": "unhealthy", "error": str(e)}
health_status["status"] = "degraded"
# Check Telegram
try:
from services.telegram_service import telegram_service
if telegram_service.is_configured:
health_status["services"]["telegram"] = {"status": "healthy"}
else:
@@ -145,23 +151,21 @@ def check_service_health() -> dict:
except Exception as e:
health_status["services"]["telegram"] = {
"status": "unavailable",
"error": str(e)
"error": str(e),
}
# Check VRAM manager
try:
from services.vram_manager import vram_manager
vram_info = vram_manager.get_vram_info()
health_status["services"]["vram"] = {
"status": "healthy",
"available_gb": vram_info.get("free", 0) / (1024**3)
"available_gb": vram_info.get("free", 0) / (1024**3),
}
except Exception as e:
health_status["services"]["vram"] = {
"status": "unavailable",
"error": str(e)
}
health_status["services"]["vram"] = {"status": "unavailable", "error": str(e)}
return health_status
@@ -172,29 +176,45 @@ def initialize_services() -> None:
from services.vram_manager import vram_manager
from services.telegram_service import telegram_service
from storage.processed_registry import processed_registry
logger.info("Initializing services...")
# Validate configuration
validate_configuration()
# Warn if WebDAV not configured
if not settings.has_webdav_config:
logger.warning("WebDAV not configured - file sync functionality disabled")
# Warn if AI providers not configured
if not settings.has_ai_config:
logger.warning("AI providers not configured - summary generation will not work")
# Configure Telegram if credentials available
if settings.TELEGRAM_TOKEN and settings.TELEGRAM_CHAT_ID:
try:
telegram_service.configure(settings.TELEGRAM_TOKEN, settings.TELEGRAM_CHAT_ID)
telegram_service.configure(
settings.TELEGRAM_TOKEN, settings.TELEGRAM_CHAT_ID
)
telegram_service.send_start_notification()
logger.info("Telegram notifications enabled")
except Exception as e:
logger.error(f"Failed to configure Telegram: {e}")
# Configure Notion if credentials available
if settings.has_notion_config:
try:
from services.notion_service import notion_service
notion_service.configure(
settings.NOTION_API_TOKEN, settings.NOTION_DATABASE_ID
)
logger.info("✅ Notion integration enabled")
except Exception as e:
logger.error(f"Failed to configure Notion: {e}")
else:
logger.info("Notion not configured - upload to Notion disabled")
# Initialize WebDAV if configured
if settings.has_webdav_config:
try:
@@ -205,7 +225,7 @@ def initialize_services() -> None:
logger.exception("WebDAV initialization error details")
else:
logger.info("Skipping WebDAV initialization (not configured)")
# Initialize VRAM manager
try:
vram_manager.initialize()
@@ -213,7 +233,7 @@ def initialize_services() -> None:
except Exception as e:
logger.error(f"Failed to initialize VRAM manager: {e}")
logger.exception("VRAM manager initialization error details")
# Initialize processed registry
try:
processed_registry.initialize()
@@ -221,11 +241,11 @@ def initialize_services() -> None:
except Exception as e:
logger.error(f"Failed to initialize processed registry: {e}")
logger.exception("Registry initialization error details")
# Run health check
health = check_service_health()
logger.info(f"Initial health check: {json.dumps(health, indent=2)}")
logger.info("All services initialized successfully")
@@ -233,6 +253,7 @@ def send_error_notification(error_type: str, error_message: str) -> None:
"""Send error notification via Telegram"""
try:
from services.telegram_service import telegram_service
if telegram_service.is_configured:
telegram_service.send_error_notification(error_type, error_message)
except Exception as e:
@@ -243,15 +264,16 @@ def run_dashboard_thread() -> None:
"""Run Flask dashboard in a separate thread"""
try:
from api.routes import create_app
app = create_app()
# Run Flask in production mode with threaded=True
app.run(
host='0.0.0.0',
host="0.0.0.0",
port=5000,
debug=False,
threaded=True,
use_reloader=False # Important: disable reloader in thread
use_reloader=False, # Important: disable reloader in thread
)
except Exception as e:
logger.error(f"Dashboard thread error: {e}")
@@ -260,14 +282,12 @@ def run_dashboard_thread() -> None:
def start_dashboard() -> threading.Thread:
"""Start dashboard in a background daemon thread"""
dashboard_port = int(os.getenv('DASHBOARD_PORT', '5000'))
dashboard_port = int(os.getenv("DASHBOARD_PORT", "5000"))
logger.info(f"Starting dashboard on port {dashboard_port}...")
# Create daemon thread so it doesn't block shutdown
dashboard_thread = threading.Thread(
target=run_dashboard_thread,
name="DashboardThread",
daemon=True
target=run_dashboard_thread, name="DashboardThread", daemon=True
)
dashboard_thread.start()
logger.info(f"Dashboard thread started (Thread-ID: {dashboard_thread.ident})")
@@ -282,109 +302,169 @@ def run_main_loop() -> None:
from processors.audio_processor import AudioProcessor
from processors.pdf_processor import PDFProcessor
from processors.text_processor import TextProcessor
audio_processor = AudioProcessor()
pdf_processor = PDFProcessor()
text_processor = TextProcessor()
consecutive_errors = 0
max_consecutive_errors = 5
while True:
try:
logger.info("--- Polling for new files ---")
processed_registry.load()
# Process PDFs
if settings.has_webdav_config:
try:
webdav_service.mkdir(settings.REMOTE_PDF_FOLDER)
pdf_files = webdav_service.list(settings.REMOTE_PDF_FOLDER)
for file_path in pdf_files:
if file_path.lower().endswith('.pdf'):
if file_path.lower().endswith(".pdf"):
if not processed_registry.is_processed(file_path):
pdf_processor.process(file_path)
from pathlib import Path
from urllib.parse import unquote
from services.telegram_service import telegram_service
local_filename = unquote(Path(file_path).name)
base_name = Path(local_filename).stem
local_path = (
settings.LOCAL_DOWNLOADS_PATH / local_filename
)
settings.LOCAL_DOWNLOADS_PATH.mkdir(
parents=True, exist_ok=True
)
# Step 1: Notify and download
telegram_service.send_message(
f"📄 Nuevo PDF detectado: {local_filename}\n"
f"⬇️ Descargando..."
)
logger.info(
f"Downloading PDF: {file_path} -> {local_path}"
)
webdav_service.download(file_path, local_path)
# Step 2: Process PDF
telegram_service.send_message(
f"🔍 Procesando PDF con OCR..."
)
pdf_processor.process(str(local_path))
processed_registry.save(file_path)
except Exception as e:
logger.exception(f"Error processing PDFs: {e}")
send_error_notification("pdf_processing", str(e))
# Process Audio files
if settings.has_webdav_config:
try:
audio_files = webdav_service.list(settings.REMOTE_AUDIOS_FOLDER)
for file_path in audio_files:
if any(file_path.lower().endswith(ext) for ext in settings.AUDIO_EXTENSIONS):
if any(
file_path.lower().endswith(ext)
for ext in settings.AUDIO_EXTENSIONS
):
if not processed_registry.is_processed(file_path):
from pathlib import Path
from urllib.parse import unquote
from document.generators import DocumentGenerator
from services.telegram_service import telegram_service
local_filename = unquote(Path(file_path).name)
base_name = Path(local_filename).stem
local_path = settings.LOCAL_DOWNLOADS_PATH / local_filename
settings.LOCAL_DOWNLOADS_PATH.mkdir(parents=True, exist_ok=True)
local_path = (
settings.LOCAL_DOWNLOADS_PATH / local_filename
)
settings.LOCAL_DOWNLOADS_PATH.mkdir(
parents=True, exist_ok=True
)
# Step 1: Notify and download
telegram_service.send_message(
f"🎵 Nuevo audio detectado: {local_filename}\n"
f"⬇️ Descargando..."
)
logger.info(f"Downloading audio: {file_path} -> {local_path}")
logger.info(
f"Downloading audio: {file_path} -> {local_path}"
)
webdav_service.download(file_path, local_path)
# Step 2: Transcribe
telegram_service.send_message(f"📝 Transcribiendo audio con Whisper...")
telegram_service.send_message(
f"📝 Transcribiendo audio con Whisper..."
)
result = audio_processor.process(str(local_path))
if result.get("success") and result.get("transcription_path"):
transcription_file = Path(result["transcription_path"])
transcription_text = result.get("text", "")
# Step 3: Generate AI summary and documents
telegram_service.send_message(f"🤖 Generando resumen con IA...")
doc_generator = DocumentGenerator()
success, summary, output_files = doc_generator.generate_summary(
transcription_text, base_name
if result.get("success") and result.get(
"transcription_path"
):
transcription_file = Path(
result["transcription_path"]
)
transcription_text = result.get("text", "")
# Step 3: Generate AI summary and documents
telegram_service.send_message(
f"🤖 Generando resumen con IA..."
)
doc_generator = DocumentGenerator()
success, summary, output_files = (
doc_generator.generate_summary(
transcription_text, base_name
)
)
# Step 4: Upload all files to Nextcloud
if success and output_files:
# Create folders
for folder in [settings.RESUMENES_FOLDER, settings.DOCX_FOLDER]:
for folder in [
settings.RESUMENES_FOLDER,
settings.DOCX_FOLDER,
]:
try:
webdav_service.makedirs(folder)
except Exception:
pass
# Upload transcription TXT
if transcription_file.exists():
remote_txt = f"{settings.RESUMENES_FOLDER}/{transcription_file.name}"
webdav_service.upload(transcription_file, remote_txt)
webdav_service.upload(
transcription_file, remote_txt
)
logger.info(f"Uploaded: {remote_txt}")
# Upload DOCX
docx_path = Path(output_files.get('docx_path', ''))
docx_path = Path(
output_files.get("docx_path", "")
)
if docx_path.exists():
remote_docx = f"{settings.DOCX_FOLDER}/{docx_path.name}"
webdav_service.upload(docx_path, remote_docx)
webdav_service.upload(
docx_path, remote_docx
)
logger.info(f"Uploaded: {remote_docx}")
# Upload PDF
pdf_path = Path(output_files.get('pdf_path', ''))
pdf_path = Path(
output_files.get("pdf_path", "")
)
if pdf_path.exists():
remote_pdf = f"{settings.DOCX_FOLDER}/{pdf_path.name}"
webdav_service.upload(pdf_path, remote_pdf)
logger.info(f"Uploaded: {remote_pdf}")
# Upload Markdown
md_path = Path(output_files.get('markdown_path', ''))
md_path = Path(
output_files.get("markdown_path", "")
)
if md_path.exists():
remote_md = f"{settings.RESUMENES_FOLDER}/{md_path.name}"
webdav_service.upload(md_path, remote_md)
logger.info(f"Uploaded: {remote_md}")
# Final notification
telegram_service.send_message(
f"✅ Audio procesado: {local_filename}\n"
@@ -396,46 +476,53 @@ def run_main_loop() -> None:
# Just upload transcription if summary failed
if transcription_file.exists():
try:
webdav_service.makedirs(settings.RESUMENES_FOLDER)
webdav_service.makedirs(
settings.RESUMENES_FOLDER
)
except Exception:
pass
remote_txt = f"{settings.RESUMENES_FOLDER}/{transcription_file.name}"
webdav_service.upload(transcription_file, remote_txt)
webdav_service.upload(
transcription_file, remote_txt
)
telegram_service.send_message(
f"⚠️ Resumen fallido, solo transcripción subida:\n{transcription_file.name}"
)
processed_registry.save(file_path)
except Exception as e:
logger.exception(f"Error processing audio: {e}")
send_error_notification("audio_processing", str(e))
# Process Text files
if settings.has_webdav_config:
try:
text_files = webdav_service.list(settings.REMOTE_TXT_FOLDER)
for file_path in text_files:
if any(file_path.lower().endswith(ext) for ext in settings.TXT_EXTENSIONS):
if any(
file_path.lower().endswith(ext)
for ext in settings.TXT_EXTENSIONS
):
if not processed_registry.is_processed(file_path):
text_processor.process(file_path)
processed_registry.save(file_path)
except Exception as e:
logger.exception(f"Error processing text: {e}")
send_error_notification("text_processing", str(e))
# Reset error counter on success
consecutive_errors = 0
except Exception as e:
# Improved error logging with full traceback
logger.exception(f"Critical error in main loop: {e}")
# Send notification for critical errors
send_error_notification("main_loop", str(e))
# Track consecutive errors
consecutive_errors += 1
if consecutive_errors >= max_consecutive_errors:
logger.critical(
f"Too many consecutive errors ({consecutive_errors}). "
@@ -443,14 +530,14 @@ def run_main_loop() -> None:
)
send_error_notification(
"consecutive_errors",
f"Service has failed {consecutive_errors} consecutive times"
f"Service has failed {consecutive_errors} consecutive times",
)
# Don't exit, let the loop continue with backoff
logger.info(f"Waiting {settings.POLL_INTERVAL * 2} seconds before retry...")
time.sleep(settings.POLL_INTERVAL * 2)
continue
logger.info(f"Cycle completed. Waiting {settings.POLL_INTERVAL} seconds...")
time.sleep(settings.POLL_INTERVAL)
@@ -462,7 +549,9 @@ def main():
try:
logger.info("=== CBCFacil Service Started ===")
logger.info(f"Version: {os.getenv('APP_VERSION', '8.0')}")
logger.info(f"Environment: {'production' if os.getenv('DEBUG', 'false').lower() != 'true' else 'development'}")
logger.info(
f"Environment: {'production' if os.getenv('DEBUG', 'false').lower() != 'true' else 'development'}"
)
lock_fd = acquire_lock()
initialize_services()
@@ -472,7 +561,7 @@ def main():
# Run main processing loop
run_main_loop()
except KeyboardInterrupt:
logger.info("Shutdown requested by user")
except Exception as e:
@@ -491,12 +580,15 @@ if __name__ == "__main__":
command = sys.argv[1]
if command == "whisper" and len(sys.argv) == 4:
from processors.audio_processor import AudioProcessor
AudioProcessor().process(sys.argv[2])
elif command == "pdf" and len(sys.argv) == 4:
from processors.pdf_processor import PDFProcessor
PDFProcessor().process(sys.argv[2])
elif command == "health":
from main import check_service_health
health = check_service_health()
print(json.dumps(health, indent=2))
else: