Files
cbc2027/document/generators.py
renato97 6058dc642e feat: Integración automática con Notion + análisis completo del código
- Instalado notion-client SDK oficial para integración robusta
- Refactorizado services/notion_service.py con SDK oficial de Notion
  - Rate limiting con retry y exponential backoff
  - Parser Markdown → Notion blocks (headings, bullets, paragraphs)
  - Soporte para pages y databases
  - Manejo robusto de errores

- Integración automática en document/generators.py
  - PDFs se suben automáticamente a Notion después de generarse
  - Contenido completo del resumen formateado con bloques
  - Metadata rica (tipo de archivo, path, fecha)

- Configuración de Notion en main.py
  - Inicialización automática al arrancar el servicio
  - Validación de credenciales

- Actualizado config/settings.py
  - Agregado load_dotenv() para cargar variables de .env
  - Configuración de Notion (NOTION_API, NOTION_DATABASE_ID)

- Scripts de utilidad creados:
  - test_notion_integration.py: Test de subida a Notion
  - test_pipeline_notion.py: Test del pipeline completo
  - verify_notion_permissions.py: Verificación de permisos
  - list_notion_pages.py: Listar páginas accesibles
  - diagnose_notion.py: Diagnóstico completo
  - create_notion_database.py: Crear database automáticamente
  - restart_service.sh: Script de reinicio del servicio

- Documentación completa en opus.md:
  - Análisis exhaustivo del codebase (42 archivos Python)
  - Bugs críticos identificados y soluciones
  - Mejoras de seguridad (autenticación, rate limiting, CORS, CSP)
  - Optimizaciones de rendimiento (Celery, Redis, PostgreSQL, WebSockets)
  - Plan de testing (estructura, ejemplos, 80% coverage goal)
  - Roadmap de implementación (6 sprints detallados)
  - Integración avanzada con Notion documentada

Estado: Notion funcionando correctamente, PDFs se suben automáticamente
2026-01-26 17:31:17 +00:00

371 lines
14 KiB
Python

"""
Document generation utilities
"""
import logging
import re
from pathlib import Path
from typing import Dict, Any, List, Tuple
from core import FileProcessingError
from config import settings
from services.ai import ai_provider_factory
class DocumentGenerator:
    """Generate summary documents (Markdown, DOCX, PDF) from processed text."""

    def __init__(self):
        # Per-module logger so log records identify this component.
        self.logger = logging.getLogger(__name__)
        # Best available content-generation provider chosen by the factory
        # (e.g. Claude/Zai per the comments in generate_summary).
        self.ai_provider = ai_provider_factory.get_best_provider()
def generate_summary(
    self, text: str, base_name: str
) -> Tuple[bool, str, Dict[str, Any]]:
    """Generate a unified summary and render it to Markdown, DOCX and PDF.

    Pipeline: extract bullet points, write a unified academic summary,
    polish the Markdown with Gemini, render the three output files and,
    when configured, upload the result to Notion.

    Args:
        text: Source text to summarize.
        base_name: Base name used for all generated files.

    Returns:
        Tuple of (success flag, summary markdown, metadata dict with the
        generated file paths and Notion upload status). On failure the
        error is logged and (False, "", {}) is returned.
    """
    self.logger.info(f"Generating summary for {base_name}")
    try:
        # Step 1-3: content generation phases (each helper logs its own step)
        bullet_points = self._generate_bullet_points(text)
        raw_summary = self._generate_raw_summary(text, bullet_points)
        summary = self._format_summary(raw_summary)

        # Generate filename
        filename = self._generate_filename(text, summary)

        # Create documents in all three output formats
        markdown_path = self._create_markdown(summary, base_name)
        docx_path = self._create_docx(summary, base_name)
        pdf_path = self._create_pdf(summary, base_name)

        # Upload to Notion if configured (best effort, never raises)
        notion_uploaded, notion_page_id = self._upload_to_notion(
            summary, base_name, pdf_path
        )

        metadata = {
            "markdown_path": str(markdown_path),
            "docx_path": str(docx_path),
            "pdf_path": str(pdf_path),
            "docx_name": Path(docx_path).name,
            "summary": summary,
            "filename": filename,
            "notion_uploaded": notion_uploaded,
            "notion_page_id": notion_page_id,
        }
        return True, summary, metadata
    except Exception as e:
        self.logger.error(f"Document generation process failed: {e}")
        return False, "", {}

def _generate_bullet_points(self, text: str) -> str:
    """Step 1: ask the main provider for 5-8 key bullet points (best effort).

    Returns a placeholder bullet on provider failure instead of raising,
    so summary generation can continue without bullets.
    """
    # Note: We use the main provider (Claude/Zai) for content generation
    self.logger.info("Generating bullet points...")
    # Truncate to avoid context limits if necessary, though providers handle it differently
    bullet_prompt = f"""Analiza el siguiente texto y extrae entre 5 y 8 bullet points clave en español.
REGLAS ESTRICTAS:
1. Devuelve ÚNICAMENTE bullet points, cada línea iniciando con "- "
2. Cada bullet debe ser conciso (12-20 palabras) y resaltar datos, fechas, conceptos o conclusiones importantes
3. NO agregues introducciones, conclusiones ni texto explicativo
4. Concéntrate en los puntos más importantes del texto
5. Incluye fechas, datos específicos y nombres relevantes si los hay
Texto:
{text[:15000]}"""
    try:
        bullet_points = self.ai_provider.generate_text(bullet_prompt)
        self.logger.info(f"Bullet points generated: {len(bullet_points)}")
        return bullet_points
    except Exception as e:
        self.logger.warning(f"Bullet point generation failed: {e}")
        return "- Puntos clave no disponibles por error en IA"

def _generate_raw_summary(self, text: str, bullet_points: str) -> str:
    """Step 2: generate the unified academic summary.

    Raises:
        Exception: re-raises any provider failure after logging it; the
        caller's outer handler converts it to a (False, "", {}) result.
    """
    self.logger.info("Generating unified summary...")
    summary_prompt = f"""Eres un profesor universitario experto en historia del siglo XX. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos.
REQUISITOS ESTRICTOS:
- Extensión entre 500-700 palabras
- Usa encabezados Markdown con jerarquía clara (##, ###)
- Desarrolla los puntos clave con profundidad y contexto histórico
- Mantén un tono académico y analítico
- Incluye conclusiones significativas
- NO agregues texto fuera del resumen
- Devuelve únicamente el resumen en formato Markdown
Contenido a resumir:
{text[:20000]}
Puntos clave a incluir obligatoriamente:
{bullet_points}"""
    try:
        return self.ai_provider.generate_text(summary_prompt)
    except Exception as e:
        self.logger.error(f"Raw summary generation failed: {e}")
        raise  # bare raise keeps the original traceback (was `raise e`)

def _format_summary(self, raw_summary: str) -> str:
    """Step 3: polish Markdown formatting with Gemini; fall back to raw."""
    self.logger.info("Formatting summary with Gemini...")
    format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible:
{raw_summary}
Instrucciones:
- Corrige cualquier error de formato
- Asegúrate de que los encabezados estén bien espaciados
- Verifica que las viñetas usen "- " correctamente
- Mantén exactamente el contenido existente
- EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave
- Devuelve únicamente el resumen formateado sin texto adicional"""
    # Use generic Gemini provider for formatting as requested
    from services.ai.gemini_provider import GeminiProvider

    formatter = GeminiProvider()
    try:
        if formatter.is_available():
            return formatter.generate_text(format_prompt)
        self.logger.warning("Gemini formatter not available, using raw summary")
        return raw_summary
    except Exception as e:
        self.logger.warning(f"Formatting failed ({e}), using raw summary")
        return raw_summary

def _upload_to_notion(
    self, summary: str, base_name: str, pdf_path
) -> Tuple[bool, Any]:
    """Upload the summary to Notion when configured.

    Returns:
        (uploaded, page_id). Never raises — upload failures are logged
        (with traceback) and reported as (False, None).
    """
    from services.notion_service import notion_service

    if not settings.has_notion_config:
        self.logger.info("Notion not configured - skipping upload")
        return False, None
    try:
        title = base_name.replace("_", " ").title()
        # Create the page with the full summary content
        notion_metadata = {
            "file_type": "Audio",  # or 'PDF' depending on the source
            "pdf_path": pdf_path,
            "add_status": False,  # Status/Tipo properties do not exist in the DB
            "use_as_page": False,  # store in the database, not as a standalone page
        }
        notion_page_id = notion_service.create_page_with_summary(
            title=title, summary=summary, metadata=notion_metadata
        )
        if notion_page_id:
            self.logger.info(
                f"✅ Resumen subido a Notion: {title} (ID: {notion_page_id})"
            )
            return True, notion_page_id
        self.logger.warning(f"⚠️ No se pudo subir a Notion: {title}")
        return False, None
    except Exception as e:
        # logger.exception records the stack trace through the logging
        # system instead of dumping it to stderr via traceback.print_exc().
        self.logger.exception(f"❌ Error al subir a Notion: {e}")
        return False, None
def _generate_filename(self, text: str, summary: str) -> str:
    """Generate an intelligent filename from the summary's key topics.

    Extracts up to three capitalized Spanish words as topics, truncated
    per the settings limits. Falls back to "documento" on any failure.
    """
    try:
        # Use AI to extract key topics
        prompt = f"""Extract 2-3 key topics from this summary to create a filename.
Summary: {summary}
Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
        # NOTE(review): sanitize_input(prompt) only cleans the prompt text —
        # no provider call is made here, so the topics below are extracted
        # heuristically from that text (or from the summary slice).
        topics_text = (
            self.ai_provider.sanitize_input(prompt)
            if hasattr(self.ai_provider, "sanitize_input")
            else summary[:100]
        )
        # Simple topic extraction: capitalized Spanish words
        topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3]
        if not topics:
            topics = ["documento"]
        # Limit topic length
        topics = [t[: settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]
        filename = "_".join(topics)[: settings.MAX_FILENAME_LENGTH]
        return filename
    except Exception as e:
        self.logger.error(f"Filename generation failed: {e}")
        # Fix: the previous fallback referenced `base_name`, which is not a
        # parameter of this method and raised NameError inside the handler.
        return "documento"[: settings.MAX_FILENAME_BASE_LENGTH]
def _create_markdown(self, summary: str, base_name: str) -> Path:
    """Write the summary to a Markdown file and return its path."""
    target_dir = settings.LOCAL_DOWNLOADS_PATH
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f"{base_name}_unificado.md"
    # Title is the base name with underscores turned into spaces, title-cased.
    title = base_name.replace("_", " ").title()
    document = f"""# {title}
## Resumen
{summary}
---
*Generado por CBCFacil*
"""
    # Path.write_text handles open/close and encoding in one call.
    target.write_text(document, encoding="utf-8")
    return target
def _create_docx(self, summary: str, base_name: str) -> Path:
    """Create a DOCX document from the Markdown summary.

    Parses the summary line by line: `#` headings become Word headings,
    `-`/`*`/`•` lines become bullets, and consecutive text lines are
    merged into justified paragraphs (with `**` bold markers stripped).

    Raises:
        FileProcessingError: if python-docx is not installed.
    """
    try:
        from docx import Document
    except ImportError:
        raise FileProcessingError("python-docx not installed")

    output_dir = settings.LOCAL_DOCX
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{base_name}_unificado.docx"

    doc = Document()
    doc.add_heading(base_name.replace("_", " ").title(), 0)

    def flush(paragraph_lines: List[str]) -> None:
        """Emit accumulated body lines as one justified paragraph."""
        if paragraph_lines:
            p = doc.add_paragraph(" ".join(paragraph_lines))
            p.alignment = 3  # WD_ALIGN_PARAGRAPH.JUSTIFY
            paragraph_lines.clear()

    # Parse and render Markdown content line by line
    current_paragraph: List[str] = []
    for line in summary.splitlines():
        line = line.strip()
        if not line:
            flush(current_paragraph)
            continue
        if line.startswith("#"):
            flush(current_paragraph)
            # Heading level = number of leading '#' characters
            level = len(line) - len(line.lstrip("#"))
            heading_text = line.lstrip("#").strip()
            if level <= 6:
                doc.add_heading(heading_text, level=level)
            else:
                # Deeper than Word supports: treat as body text
                current_paragraph.append(heading_text)
        # Fix: the bullet test previously included startswith(""), which is
        # True for EVERY string, so all body text was rendered as bullets
        # and the else-branch below was dead code.
        elif line.startswith(("-", "*", "•")):
            flush(current_paragraph)
            bullet_text = line.lstrip("-*• ").strip()
            doc.add_paragraph(bullet_text, style="List Bullet")
        else:
            # Strip bold markers in body text — the summaries abuse
            # asterisks otherwise ("se abusa de los asteriscos").
            current_paragraph.append(line.replace("**", ""))
    flush(current_paragraph)

    doc.add_page_break()
    doc.add_paragraph("*Generado por CBCFacil*")
    doc.save(output_path)
    return output_path
def _create_pdf(self, summary: str, base_name: str) -> Path:
    """Render the Markdown summary to a simple PDF with reportlab.

    Headings are drawn bold and sized by level, bullet lines are wrapped
    and prefixed with a bullet marker, and body text is word-wrapped.

    Raises:
        FileProcessingError: if reportlab is not installed.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
        import textwrap
    except ImportError:
        raise FileProcessingError("reportlab not installed")

    output_dir = settings.LOCAL_DOWNLOADS_PATH
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{base_name}_unificado.pdf"

    c = canvas.Canvas(str(output_path), pagesize=letter)
    width, height = letter
    margin = 72  # one inch, in points
    y_position = height - margin

    def new_page():
        """Start a fresh page and reset the cursor and body font."""
        nonlocal y_position
        c.showPage()
        c.setFont("Helvetica", 11)
        y_position = height - margin

    c.setFont("Helvetica", 11)
    # Title
    c.setFont("Helvetica-Bold", 16)
    c.drawString(margin, y_position, base_name.replace("_", " ").title()[:100])
    y_position -= 28
    c.setFont("Helvetica", 11)

    # Remove asterisks globally for cleaner PDF
    summary_clean = summary.replace("**", "")

    for raw_line in summary_clean.splitlines():
        line = raw_line.rstrip()
        if not line.strip():
            y_position -= 14  # blank-line spacing
            if y_position < margin:
                new_page()
            continue
        stripped = line.lstrip()
        if stripped.startswith("#"):
            level = len(stripped) - len(stripped.lstrip("#"))
            heading_text = stripped.lstrip("#").strip()
            if heading_text:
                font_size = 16 if level == 1 else 14 if level == 2 else 12
                c.setFont("Helvetica-Bold", font_size)
                c.drawString(margin, y_position, heading_text[:90])
                y_position -= font_size + 6
                if y_position < margin:
                    new_page()
                c.setFont("Helvetica", 11)
            continue
        # Fix: the bullet test previously included "" in the startswith
        # tuple, which matches every string, so ALL body text was treated
        # as bullets. Match the real bullet markers instead.
        if stripped.startswith(("-", "*", "•")):
            bullet_text = stripped.lstrip("-*•").strip()
            wrapped_lines = textwrap.wrap(bullet_text, width=80) or [""]
            for idx, wrapped in enumerate(wrapped_lines):
                # First wrapped line carries the bullet marker (fix: the
                # prefix was previously empty); continuations are indented.
                prefix = "• " if idx == 0 else "  "
                c.drawString(margin, y_position, f"{prefix}{wrapped}")
                y_position -= 14
                if y_position < margin:
                    new_page()
            continue
        # Body text - justified approximation (ReportLab native justification
        # requires Paragraph styles, defaulting to wrap)
        for wrapped in textwrap.wrap(stripped, width=90) or [""]:
            c.drawString(margin, y_position, wrapped)
            y_position -= 14
            if y_position < margin:
                new_page()

    c.save()
    return output_path