feat: Integración automática con Notion + análisis completo del código

- Instalado notion-client SDK oficial para integración robusta
- Refactorizado services/notion_service.py con SDK oficial de Notion
  - Rate limiting con retry y exponential backoff
  - Parser Markdown → Notion blocks (headings, bullets, paragraphs)
  - Soporte para pages y databases
  - Manejo robusto de errores

- Integración automática en document/generators.py
  - PDFs se suben automáticamente a Notion después de generarse
  - Contenido completo del resumen formateado con bloques
  - Metadata rica (tipo de archivo, path, fecha)

- Configuración de Notion en main.py
  - Inicialización automática al arrancar el servicio
  - Validación de credenciales

- Actualizado config/settings.py
  - Agregado load_dotenv() para cargar variables de .env
  - Configuración de Notion (NOTION_API, NOTION_DATABASE_ID)

- Scripts de utilidad creados:
  - test_notion_integration.py: Test de subida a Notion
  - test_pipeline_notion.py: Test del pipeline completo
  - verify_notion_permissions.py: Verificación de permisos
  - list_notion_pages.py: Listar páginas accesibles
  - diagnose_notion.py: Diagnóstico completo
  - create_notion_database.py: Crear database automáticamente
  - restart_service.sh: Script de reinicio del servicio

- Documentación completa en opus.md:
  - Análisis exhaustivo del codebase (42 archivos Python)
  - Bugs críticos identificados y soluciones
  - Mejoras de seguridad (autenticación, rate limiting, CORS, CSP)
  - Optimizaciones de rendimiento (Celery, Redis, PostgreSQL, WebSockets)
  - Plan de testing (estructura, ejemplos, 80% coverage goal)
  - Roadmap de implementación (6 sprints detallados)
  - Integración avanzada con Notion documentada

Estado: Notion funcionando correctamente, PDFs se suben automáticamente
This commit is contained in:
renato97
2026-01-26 17:26:50 +00:00
parent 47896fd50a
commit fb107cbe52
12 changed files with 3863 additions and 184 deletions

View File

@@ -1,6 +1,7 @@
"""
Document generation utilities
"""
import logging
import re
from pathlib import Path
@@ -17,7 +18,9 @@ class DocumentGenerator:
self.logger = logging.getLogger(__name__)
self.ai_provider = ai_provider_factory.get_best_provider()
def generate_summary(self, text: str, base_name: str) -> Tuple[bool, str, Dict[str, Any]]:
def generate_summary(
self, text: str, base_name: str
) -> Tuple[bool, str, Dict[str, Any]]:
"""Generate unified summary"""
self.logger.info(f"Generating summary for {base_name}")
@@ -36,7 +39,7 @@ REGLAS ESTRICTAS:
Texto:
{text[:15000]}""" # Truncate to avoid context limits if necessary, though providers handle it differently
try:
bullet_points = self.ai_provider.generate_text(bullet_prompt)
self.logger.info(f"Bullet points generated: {len(bullet_points)}")
@@ -85,13 +88,16 @@ Instrucciones:
# Use generic Gemini provider for formatting as requested
from services.ai.gemini_provider import GeminiProvider
formatter = GeminiProvider()
try:
if formatter.is_available():
summary = formatter.generate_text(format_prompt)
else:
self.logger.warning("Gemini formatter not available, using raw summary")
self.logger.warning(
"Gemini formatter not available, using raw summary"
)
summary = raw_summary
except Exception as e:
self.logger.warning(f"Formatting failed ({e}), using raw summary")
@@ -105,13 +111,51 @@ Instrucciones:
docx_path = self._create_docx(summary, base_name)
pdf_path = self._create_pdf(summary, base_name)
# Upload to Notion if configured
from services.notion_service import notion_service
notion_uploaded = False
notion_page_id = None
if settings.has_notion_config:
try:
title = base_name.replace("_", " ").title()
# Crear página con el contenido completo del resumen
notion_metadata = {
"file_type": "Audio", # O 'PDF' dependiendo del origen
"pdf_path": pdf_path,
"add_status": False, # No usar Status/Tipo (no existen en la DB)
"use_as_page": False, # Usar como database, no página
}
notion_page_id = notion_service.create_page_with_summary(
title=title, summary=summary, metadata=notion_metadata
)
if notion_page_id:
notion_uploaded = True
self.logger.info(
f"✅ Resumen subido a Notion: {title} (ID: {notion_page_id})"
)
else:
self.logger.warning(f"⚠️ No se pudo subir a Notion: {title}")
except Exception as e:
self.logger.warning(f"❌ Error al subir a Notion: {e}")
import traceback
traceback.print_exc()
else:
self.logger.info("Notion not configured - skipping upload")
metadata = {
'markdown_path': str(markdown_path),
'docx_path': str(docx_path),
'pdf_path': str(pdf_path),
'docx_name': Path(docx_path).name,
'summary': summary,
'filename': filename
"markdown_path": str(markdown_path),
"docx_path": str(docx_path),
"pdf_path": str(pdf_path),
"docx_name": Path(docx_path).name,
"summary": summary,
"filename": filename,
"notion_uploaded": notion_uploaded,
"notion_page_id": notion_page_id,
}
return True, summary, metadata
@@ -129,22 +173,26 @@ Summary: {summary}
Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
topics_text = self.ai_provider.sanitize_input(prompt) if hasattr(self.ai_provider, 'sanitize_input') else summary[:100]
topics_text = (
self.ai_provider.sanitize_input(prompt)
if hasattr(self.ai_provider, "sanitize_input")
else summary[:100]
)
# Simple topic extraction
topics = re.findall(r'\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b', topics_text)[:3]
topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3]
if not topics:
topics = ['documento']
topics = ["documento"]
# Limit topic length
topics = [t[:settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]
topics = [t[: settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]
filename = '_'.join(topics)[:settings.MAX_FILENAME_LENGTH]
filename = "_".join(topics)[: settings.MAX_FILENAME_LENGTH]
return filename
except Exception as e:
self.logger.error(f"Filename generation failed: {e}")
return base_name[:settings.MAX_FILENAME_BASE_LENGTH]
return base_name[: settings.MAX_FILENAME_BASE_LENGTH]
def _create_markdown(self, summary: str, base_name: str) -> Path:
"""Create Markdown document"""
@@ -153,7 +201,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
output_path = output_dir / f"{base_name}_unificado.md"
content = f"""# {base_name.replace('_', ' ').title()}
content = f"""# {base_name.replace("_", " ").title()}
## Resumen
@@ -164,7 +212,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
*Generado por CBCFacil*
"""
with open(output_path, 'w', encoding='utf-8') as f:
with open(output_path, "w", encoding="utf-8") as f:
f.write(content)
return output_path
@@ -183,51 +231,53 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
output_path = output_dir / f"{base_name}_unificado.docx"
doc = Document()
doc.add_heading(base_name.replace('_', ' ').title(), 0)
doc.add_heading(base_name.replace("_", " ").title(), 0)
# Parse and render Markdown content line by line
lines = summary.splitlines()
current_paragraph = []
for line in lines:
line = line.strip()
if not line:
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3 # JUSTIFY alignment (WD_ALIGN_PARAGRAPH.JUSTIFY=3)
current_paragraph = []
continue
if line.startswith('#'):
if line.startswith("#"):
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
current_paragraph = []
# Process heading
level = len(line) - len(line.lstrip('#'))
heading_text = line.lstrip('#').strip()
level = len(line) - len(line.lstrip("#"))
heading_text = line.lstrip("#").strip()
if level <= 6:
doc.add_heading(heading_text, level=level)
else:
current_paragraph.append(heading_text)
elif line.startswith('-') or line.startswith('*') or line.startswith('•'):
elif line.startswith("-") or line.startswith("*") or line.startswith("•"):
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
current_paragraph = []
bullet_text = line.lstrip('-*• ').strip()
p = doc.add_paragraph(bullet_text, style='List Bullet')
bullet_text = line.lstrip("-*• ").strip()
p = doc.add_paragraph(bullet_text, style="List Bullet")
# Remove bold markers from bullets if present
if '**' in bullet_text:
if "**" in bullet_text:
# Basic cleanup for bullets
pass
pass
else:
# Clean up excessive bold markers in body text if user requested
clean_line = line.replace('**', '') # Removing asterisks as per user complaint "se abusa de los asteriscos"
clean_line = line.replace(
"**", ""
) # Removing asterisks as per user complaint "se abusa de los asteriscos"
current_paragraph.append(clean_line)
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
doc.add_page_break()
@@ -258,18 +308,20 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
def new_page():
nonlocal y_position
c.showPage()
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
y_position = height - margin
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
# Title
c.setFont('Helvetica-Bold', 16)
c.drawString(margin, y_position, base_name.replace('_', ' ').title()[:100])
c.setFont("Helvetica-Bold", 16)
c.drawString(margin, y_position, base_name.replace("_", " ").title()[:100])
y_position -= 28
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
summary_clean = summary.replace('**', '') # Remove asterisks globally for cleaner PDF
summary_clean = summary.replace(
"**", ""
) # Remove asterisks globally for cleaner PDF
for raw_line in summary_clean.splitlines():
line = raw_line.rstrip()
@@ -282,24 +334,24 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
stripped = line.lstrip()
if stripped.startswith('#'):
level = len(stripped) - len(stripped.lstrip('#'))
heading_text = stripped.lstrip('#').strip()
if stripped.startswith("#"):
level = len(stripped) - len(stripped.lstrip("#"))
heading_text = stripped.lstrip("#").strip()
if heading_text:
font_size = 16 if level == 1 else 14 if level == 2 else 12
c.setFont('Helvetica-Bold', font_size)
c.setFont("Helvetica-Bold", font_size)
c.drawString(margin, y_position, heading_text[:90])
y_position -= font_size + 6
if y_position < margin:
new_page()
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
continue
if stripped.startswith(('-', '*', '•')):
bullet_text = stripped.lstrip('-*•').strip()
wrapped_lines = textwrap.wrap(bullet_text, width=80) or ['']
if stripped.startswith(("-", "*", "•")):
bullet_text = stripped.lstrip("-*•").strip()
wrapped_lines = textwrap.wrap(bullet_text, width=80) or [""]
for idx, wrapped in enumerate(wrapped_lines):
prefix = '• ' if idx == 0 else '  '
prefix = "• " if idx == 0 else "  "
c.drawString(margin, y_position, f"{prefix}{wrapped}")
y_position -= 14
if y_position < margin:
@@ -307,7 +359,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
continue
# Body text - Justified approximation (ReportLab native justification requires Paragraph styles, defaulting to wrap)
wrapped_lines = textwrap.wrap(stripped, width=90) or ['']
wrapped_lines = textwrap.wrap(stripped, width=90) or [""]
for wrapped in wrapped_lines:
c.drawString(margin, y_position, wrapped)
y_position -= 14