feat: Integración automática con Notion + análisis completo del código

- Instalado notion-client SDK oficial para integración robusta
- Refactorizado services/notion_service.py con SDK oficial de Notion
  - Rate limiting con retry y exponential backoff
  - Parser Markdown → Notion blocks (headings, bullets, paragraphs)
  - Soporte para pages y databases
  - Manejo robusto de errores

- Integración automática en document/generators.py
  - PDFs se suben automáticamente a Notion después de generarse
  - Contenido completo del resumen formateado con bloques
  - Metadata rica (tipo de archivo, path, fecha)

- Configuración de Notion en main.py
  - Inicialización automática al arrancar el servicio
  - Validación de credenciales

- Actualizado config/settings.py
  - Agregado load_dotenv() para cargar variables de .env
  - Configuración de Notion (NOTION_API, NOTION_DATABASE_ID)

- Scripts de utilidad creados:
  - test_notion_integration.py: Test de subida a Notion
  - test_pipeline_notion.py: Test del pipeline completo
  - verify_notion_permissions.py: Verificación de permisos
  - list_notion_pages.py: Listar páginas accesibles
  - diagnose_notion.py: Diagnóstico completo
  - create_notion_database.py: Crear database automáticamente
  - restart_service.sh: Script de reinicio del servicio

- Documentación completa en opus.md:
  - Análisis exhaustivo del codebase (42 archivos Python)
  - Bugs críticos identificados y soluciones
  - Mejoras de seguridad (autenticación, rate limiting, CORS, CSP)
  - Optimizaciones de rendimiento (Celery, Redis, PostgreSQL, WebSockets)
  - Plan de testing (estructura, ejemplos, 80% coverage goal)
  - Roadmap de implementación (6 sprints detallados)
  - Integración avanzada con Notion documentada

Estado: Notion funcionando correctamente, PDFs se suben automáticamente
This commit is contained in:
renato97
2026-01-26 17:26:50 +00:00
parent 47896fd50a
commit fb107cbe52
12 changed files with 3863 additions and 184 deletions

View File

@@ -1,6 +1,7 @@
"""
Document generation utilities
"""
import logging
import re
from pathlib import Path
@@ -17,7 +18,9 @@ class DocumentGenerator:
self.logger = logging.getLogger(__name__)
self.ai_provider = ai_provider_factory.get_best_provider()
def generate_summary(self, text: str, base_name: str) -> Tuple[bool, str, Dict[str, Any]]:
def generate_summary(
self, text: str, base_name: str
) -> Tuple[bool, str, Dict[str, Any]]:
"""Generate unified summary"""
self.logger.info(f"Generating summary for {base_name}")
@@ -36,7 +39,7 @@ REGLAS ESTRICTAS:
Texto:
{text[:15000]}""" # Truncate to avoid context limits if necessary, though providers handle it differently
try:
bullet_points = self.ai_provider.generate_text(bullet_prompt)
self.logger.info(f"Bullet points generated: {len(bullet_points)}")
@@ -85,13 +88,16 @@ Instrucciones:
# Use generic Gemini provider for formatting as requested
from services.ai.gemini_provider import GeminiProvider
formatter = GeminiProvider()
try:
if formatter.is_available():
summary = formatter.generate_text(format_prompt)
else:
self.logger.warning("Gemini formatter not available, using raw summary")
self.logger.warning(
"Gemini formatter not available, using raw summary"
)
summary = raw_summary
except Exception as e:
self.logger.warning(f"Formatting failed ({e}), using raw summary")
@@ -105,13 +111,51 @@ Instrucciones:
docx_path = self._create_docx(summary, base_name)
pdf_path = self._create_pdf(summary, base_name)
# Upload to Notion if configured
from services.notion_service import notion_service
notion_uploaded = False
notion_page_id = None
if settings.has_notion_config:
try:
title = base_name.replace("_", " ").title()
# Crear página con el contenido completo del resumen
notion_metadata = {
"file_type": "Audio", # O 'PDF' dependiendo del origen
"pdf_path": pdf_path,
"add_status": False, # No usar Status/Tipo (no existen en la DB)
"use_as_page": False, # Usar como database, no página
}
notion_page_id = notion_service.create_page_with_summary(
title=title, summary=summary, metadata=notion_metadata
)
if notion_page_id:
notion_uploaded = True
self.logger.info(
f"✅ Resumen subido a Notion: {title} (ID: {notion_page_id})"
)
else:
self.logger.warning(f"⚠️ No se pudo subir a Notion: {title}")
except Exception as e:
self.logger.warning(f"❌ Error al subir a Notion: {e}")
import traceback
traceback.print_exc()
else:
self.logger.info("Notion not configured - skipping upload")
metadata = {
'markdown_path': str(markdown_path),
'docx_path': str(docx_path),
'pdf_path': str(pdf_path),
'docx_name': Path(docx_path).name,
'summary': summary,
'filename': filename
"markdown_path": str(markdown_path),
"docx_path": str(docx_path),
"pdf_path": str(pdf_path),
"docx_name": Path(docx_path).name,
"summary": summary,
"filename": filename,
"notion_uploaded": notion_uploaded,
"notion_page_id": notion_page_id,
}
return True, summary, metadata
@@ -129,22 +173,26 @@ Summary: {summary}
Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
topics_text = self.ai_provider.sanitize_input(prompt) if hasattr(self.ai_provider, 'sanitize_input') else summary[:100]
topics_text = (
self.ai_provider.sanitize_input(prompt)
if hasattr(self.ai_provider, "sanitize_input")
else summary[:100]
)
# Simple topic extraction
topics = re.findall(r'\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b', topics_text)[:3]
topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3]
if not topics:
topics = ['documento']
topics = ["documento"]
# Limit topic length
topics = [t[:settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]
topics = [t[: settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]
filename = '_'.join(topics)[:settings.MAX_FILENAME_LENGTH]
filename = "_".join(topics)[: settings.MAX_FILENAME_LENGTH]
return filename
except Exception as e:
self.logger.error(f"Filename generation failed: {e}")
return base_name[:settings.MAX_FILENAME_BASE_LENGTH]
return base_name[: settings.MAX_FILENAME_BASE_LENGTH]
def _create_markdown(self, summary: str, base_name: str) -> Path:
"""Create Markdown document"""
@@ -153,7 +201,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
output_path = output_dir / f"{base_name}_unificado.md"
content = f"""# {base_name.replace('_', ' ').title()}
content = f"""# {base_name.replace("_", " ").title()}
## Resumen
@@ -164,7 +212,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
*Generado por CBCFacil*
"""
with open(output_path, 'w', encoding='utf-8') as f:
with open(output_path, "w", encoding="utf-8") as f:
f.write(content)
return output_path
@@ -183,51 +231,53 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
output_path = output_dir / f"{base_name}_unificado.docx"
doc = Document()
doc.add_heading(base_name.replace('_', ' ').title(), 0)
doc.add_heading(base_name.replace("_", " ").title(), 0)
# Parse and render Markdown content line by line
lines = summary.splitlines()
current_paragraph = []
for line in lines:
line = line.strip()
if not line:
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3 # JUSTIFY alignment (WD_ALIGN_PARAGRAPH.JUSTIFY=3)
current_paragraph = []
continue
if line.startswith('#'):
if line.startswith("#"):
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
current_paragraph = []
# Process heading
level = len(line) - len(line.lstrip('#'))
heading_text = line.lstrip('#').strip()
level = len(line) - len(line.lstrip("#"))
heading_text = line.lstrip("#").strip()
if level <= 6:
doc.add_heading(heading_text, level=level)
else:
current_paragraph.append(heading_text)
elif line.startswith('-') or line.startswith('*') or line.startswith('•'):
elif line.startswith("-") or line.startswith("*") or line.startswith("•"):
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
current_paragraph = []
bullet_text = line.lstrip('-*• ').strip()
p = doc.add_paragraph(bullet_text, style='List Bullet')
bullet_text = line.lstrip("-*• ").strip()
p = doc.add_paragraph(bullet_text, style="List Bullet")
# Remove bold markers from bullets if present
if '**' in bullet_text:
if "**" in bullet_text:
# Basic cleanup for bullets
pass
pass
else:
# Clean up excessive bold markers in body text if user requested
clean_line = line.replace('**', '') # Removing asterisks as per user complaint "se abusa de los asteriscos"
clean_line = line.replace(
"**", ""
) # Removing asterisks as per user complaint "se abusa de los asteriscos"
current_paragraph.append(clean_line)
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
doc.add_page_break()
@@ -258,18 +308,20 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
def new_page():
nonlocal y_position
c.showPage()
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
y_position = height - margin
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
# Title
c.setFont('Helvetica-Bold', 16)
c.drawString(margin, y_position, base_name.replace('_', ' ').title()[:100])
c.setFont("Helvetica-Bold", 16)
c.drawString(margin, y_position, base_name.replace("_", " ").title()[:100])
y_position -= 28
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
summary_clean = summary.replace('**', '') # Remove asterisks globally for cleaner PDF
summary_clean = summary.replace(
"**", ""
) # Remove asterisks globally for cleaner PDF
for raw_line in summary_clean.splitlines():
line = raw_line.rstrip()
@@ -282,24 +334,24 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
stripped = line.lstrip()
if stripped.startswith('#'):
level = len(stripped) - len(stripped.lstrip('#'))
heading_text = stripped.lstrip('#').strip()
if stripped.startswith("#"):
level = len(stripped) - len(stripped.lstrip("#"))
heading_text = stripped.lstrip("#").strip()
if heading_text:
font_size = 16 if level == 1 else 14 if level == 2 else 12
c.setFont('Helvetica-Bold', font_size)
c.setFont("Helvetica-Bold", font_size)
c.drawString(margin, y_position, heading_text[:90])
y_position -= font_size + 6
if y_position < margin:
new_page()
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
continue
if stripped.startswith(('-', '*', '•')):
bullet_text = stripped.lstrip('-*•').strip()
wrapped_lines = textwrap.wrap(bullet_text, width=80) or ['']
if stripped.startswith(("-", "*", "•")):
bullet_text = stripped.lstrip("-*•").strip()
wrapped_lines = textwrap.wrap(bullet_text, width=80) or [""]
for idx, wrapped in enumerate(wrapped_lines):
prefix = '• ' if idx == 0 else '  '
prefix = "• " if idx == 0 else "  "
c.drawString(margin, y_position, f"{prefix}{wrapped}")
y_position -= 14
if y_position < margin:
@@ -307,7 +359,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
continue
# Body text - Justified approximation (ReportLab native justification requires Paragraph styles, defaulting to wrap)
wrapped_lines = textwrap.wrap(stripped, width=90) or ['']
wrapped_lines = textwrap.wrap(stripped, width=90) or [""]
for wrapped in wrapped_lines:
c.drawString(margin, y_position, wrapped)
y_position -= 14