- Instalado notion-client SDK oficial para integración robusta - Refactorizado services/notion_service.py con SDK oficial de Notion - Rate limiting con retry y exponential backoff - Parser Markdown → Notion blocks (headings, bullets, paragraphs) - Soporte para pages y databases - Manejo robusto de errores - Integración automática en document/generators.py - PDFs se suben automáticamente a Notion después de generarse - Contenido completo del resumen formateado con bloques - Metadata rica (tipo de archivo, path, fecha) - Configuración de Notion en main.py - Inicialización automática al arrancar el servicio - Validación de credenciales - Actualizado config/settings.py - Agregado load_dotenv() para cargar variables de .env - Configuración de Notion (NOTION_API, NOTION_DATABASE_ID) - Scripts de utilidad creados: - test_notion_integration.py: Test de subida a Notion - test_pipeline_notion.py: Test del pipeline completo - verify_notion_permissions.py: Verificación de permisos - list_notion_pages.py: Listar páginas accesibles - diagnose_notion.py: Diagnóstico completo - create_notion_database.py: Crear database automáticamente - restart_service.sh: Script de reinicio del servicio - Documentación completa en opus.md: - Análisis exhaustivo del codebase (42 archivos Python) - Bugs críticos identificados y soluciones - Mejoras de seguridad (autenticación, rate limiting, CORS, CSP) - Optimizaciones de rendimiento (Celery, Redis, PostgreSQL, WebSockets) - Plan de testing (estructura, ejemplos, 80% coverage goal) - Roadmap de implementación (6 sprints detallados) - Integración avanzada con Notion documentada Estado: Notion funcionando correctamente, PDFs se suben automáticamente
371 lines
14 KiB
Python
371 lines
14 KiB
Python
"""
|
|
Document generation utilities
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, Any, List, Tuple
|
|
from core import FileProcessingError
|
|
from config import settings
|
|
from services.ai import ai_provider_factory
|
|
|
|
|
|
class DocumentGenerator:
    """Generate documents from processed text.

    Pipeline: extract bullet points, produce a unified academic summary,
    polish the Markdown with Gemini, then render Markdown/DOCX/PDF files
    and (optionally) upload the result to Notion.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Main content-generation provider (e.g. Claude/Zai), chosen by the factory.
        self.ai_provider = ai_provider_factory.get_best_provider()

    def generate_summary(
        self, text: str, base_name: str
    ) -> Tuple[bool, str, Dict[str, Any]]:
        """Generate a unified summary plus all derived artifacts.

        Args:
            text: Full source text to summarize.
            base_name: Base name (no extension) used for the output files.

        Returns:
            Tuple ``(success, summary_markdown, metadata)``. On any failure
            the tuple is ``(False, "", {})`` — errors are logged, not raised.
        """
        self.logger.info(f"Generating summary for {base_name}")

        try:
            # Step 1: key bullet points (best effort — failure is tolerated).
            bullet_points = self._generate_bullet_points(text)

            # Step 2: unified summary (failure here aborts the pipeline).
            raw_summary = self._generate_raw_summary(text, bullet_points)

            # Step 3: formatting pass with Gemini (falls back to raw summary).
            summary = self._format_summary(raw_summary)

            # Topic-based filename derived from the summary content.
            filename = self._generate_filename(text, summary, base_name)

            # Render every supported output format.
            markdown_path = self._create_markdown(summary, base_name)
            docx_path = self._create_docx(summary, base_name)
            pdf_path = self._create_pdf(summary, base_name)

            # Best-effort Notion upload (never fails the pipeline).
            notion_uploaded, notion_page_id = self._upload_to_notion(
                summary, base_name, pdf_path
            )

            metadata = {
                "markdown_path": str(markdown_path),
                "docx_path": str(docx_path),
                "pdf_path": str(pdf_path),
                "docx_name": Path(docx_path).name,
                "summary": summary,
                "filename": filename,
                "notion_uploaded": notion_uploaded,
                "notion_page_id": notion_page_id,
            }

            return True, summary, metadata

        except Exception as e:
            self.logger.error(f"Document generation process failed: {e}")
            return False, "", {}

    def _generate_bullet_points(self, text: str) -> str:
        """Extract 5-8 key bullet points in Spanish from *text* (best effort).

        Returns a placeholder bullet line when the AI provider fails, so the
        rest of the pipeline can continue.
        """
        self.logger.info("Generating bullet points...")
        # Truncate to avoid context limits; providers handle overflow differently.
        bullet_prompt = f"""Analiza el siguiente texto y extrae entre 5 y 8 bullet points clave en español.

REGLAS ESTRICTAS:
1. Devuelve ÚNICAMENTE bullet points, cada línea iniciando con "- "
2. Cada bullet debe ser conciso (12-20 palabras) y resaltar datos, fechas, conceptos o conclusiones importantes
3. NO agregues introducciones, conclusiones ni texto explicativo
4. Concéntrate en los puntos más importantes del texto
5. Incluye fechas, datos específicos y nombres relevantes si los hay

Texto:
{text[:15000]}"""

        try:
            bullet_points = self.ai_provider.generate_text(bullet_prompt)
            self.logger.info(f"Bullet points generated: {len(bullet_points)}")
            return bullet_points
        except Exception as e:
            # A missing bullet list must not abort summary generation.
            self.logger.warning(f"Bullet point generation failed: {e}")
            return "- Puntos clave no disponibles por error en IA"

    def _generate_raw_summary(self, text: str, bullet_points: str) -> str:
        """Generate the unified academic summary.

        Raises:
            Exception: re-raised from the AI provider; the caller treats this
            as a fatal pipeline error.
        """
        self.logger.info("Generating unified summary...")
        summary_prompt = f"""Eres un profesor universitario experto en historia del siglo XX. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos.

REQUISITOS ESTRICTOS:
- Extensión entre 500-700 palabras
- Usa encabezados Markdown con jerarquía clara (##, ###)
- Desarrolla los puntos clave con profundidad y contexto histórico
- Mantén un tono académico y analítico
- Incluye conclusiones significativas
- NO agregues texto fuera del resumen
- Devuelve únicamente el resumen en formato Markdown

Contenido a resumir:
{text[:20000]}

Puntos clave a incluir obligatoriamente:
{bullet_points}"""

        try:
            return self.ai_provider.generate_text(summary_prompt)
        except Exception as e:
            self.logger.error(f"Raw summary generation failed: {e}")
            raise  # bare raise preserves the original traceback

    def _format_summary(self, raw_summary: str) -> str:
        """Polish the Markdown of *raw_summary* with Gemini.

        Falls back to the raw summary when Gemini is unavailable or errors.
        """
        self.logger.info("Formatting summary with Gemini...")
        format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible:

{raw_summary}

Instrucciones:
- Corrige cualquier error de formato
- Asegúrate de que los encabezados estén bien espaciados
- Verifica que las viñetas usen "- " correctamente
- Mantén exactamente el contenido existente
- EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave
- Devuelve únicamente el resumen formateado sin texto adicional"""

        # Formatting is explicitly delegated to Gemini regardless of which
        # provider generated the content.
        from services.ai.gemini_provider import GeminiProvider

        formatter = GeminiProvider()

        try:
            if formatter.is_available():
                return formatter.generate_text(format_prompt)
            self.logger.warning("Gemini formatter not available, using raw summary")
            return raw_summary
        except Exception as e:
            self.logger.warning(f"Formatting failed ({e}), using raw summary")
            return raw_summary

    def _upload_to_notion(
        self, summary: str, base_name: str, pdf_path: Path
    ) -> Tuple[bool, Any]:
        """Upload the summary to Notion when credentials are configured.

        Returns:
            ``(uploaded, page_id)``. ``(False, None)`` when Notion is not
            configured or the upload fails; failures never propagate.
        """
        from services.notion_service import notion_service

        if not settings.has_notion_config:
            self.logger.info("Notion not configured - skipping upload")
            return False, None

        try:
            title = base_name.replace("_", " ").title()

            # Page payload: full summary content plus rich metadata.
            notion_metadata = {
                "file_type": "Audio",  # or 'PDF' depending on the source
                "pdf_path": pdf_path,
                "add_status": False,  # Status/Tipo properties do not exist in the DB
                "use_as_page": False,  # target a database entry, not a standalone page
            }

            notion_page_id = notion_service.create_page_with_summary(
                title=title, summary=summary, metadata=notion_metadata
            )

            if notion_page_id:
                self.logger.info(
                    f"✅ Resumen subido a Notion: {title} (ID: {notion_page_id})"
                )
                return True, notion_page_id

            self.logger.warning(f"⚠️ No se pudo subir a Notion: {title}")
            return False, None
        except Exception as e:
            # Best-effort upload: log with traceback instead of printing to stderr.
            self.logger.warning(f"❌ Error al subir a Notion: {e}", exc_info=True)
            return False, None

    def _generate_filename(
        self, text: str, summary: str, base_name: str = "documento"
    ) -> str:
        """Derive a short, topic-based filename from the summary.

        Args:
            text: Original source text (currently unused; kept for API stability).
            summary: Generated summary the topics are extracted from.
            base_name: Fallback stem used when topic extraction fails.
        """
        try:
            prompt = f"""Extract 2-3 key topics from this summary to create a filename.
Summary: {summary}

Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""

            # NOTE(review): sanitize_input() does not query the model — the
            # "topics" below are regex-extracted from the prompt/summary text
            # itself. Confirm this is intentional rather than a missing
            # generate_text() call.
            topics_text = (
                self.ai_provider.sanitize_input(prompt)
                if hasattr(self.ai_provider, "sanitize_input")
                else summary[:100]
            )

            # Capitalized Spanish words approximate the key topics.
            topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3]
            if not topics:
                topics = ["documento"]

            # Limit each topic's length, then the overall filename length.
            topics = [t[: settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]
            return "_".join(topics)[: settings.MAX_FILENAME_LENGTH]

        except Exception as e:
            self.logger.error(f"Filename generation failed: {e}")
            # BUG FIX: base_name was previously referenced here without being
            # a parameter of this method, raising NameError on the fallback
            # path. It is now passed in (with a safe default).
            return base_name[: settings.MAX_FILENAME_BASE_LENGTH]

    def _create_markdown(self, summary: str, base_name: str) -> Path:
        """Create the Markdown document and return its path."""
        output_dir = settings.LOCAL_DOWNLOADS_PATH
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / f"{base_name}_unificado.md"

        content = f"""# {base_name.replace("_", " ").title()}

## Resumen

{summary}

---

*Generado por CBCFacil*
"""

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)

        return output_path

    def _create_docx(self, summary: str, base_name: str) -> Path:
        """Create a DOCX document by rendering the Markdown summary.

        Raises:
            FileProcessingError: if python-docx is not installed.
        """
        try:
            from docx import Document
            from docx.enum.text import WD_ALIGN_PARAGRAPH
        except ImportError:
            raise FileProcessingError("python-docx not installed")

        output_dir = settings.LOCAL_DOCX
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / f"{base_name}_unificado.docx"

        doc = Document()
        doc.add_heading(base_name.replace("_", " ").title(), 0)

        # Render Markdown line by line, accumulating consecutive body lines
        # into single justified paragraphs.
        current_paragraph: List[str] = []

        def flush_paragraph() -> None:
            """Emit any accumulated body text as one justified paragraph."""
            if current_paragraph:
                p = doc.add_paragraph(" ".join(current_paragraph))
                p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                current_paragraph.clear()

        for line in summary.splitlines():
            line = line.strip()
            if not line:
                flush_paragraph()
                continue

            if line.startswith("#"):
                flush_paragraph()
                level = len(line) - len(line.lstrip("#"))
                heading_text = line.lstrip("#").strip()
                if level <= 6:
                    doc.add_heading(heading_text, level=level)
                else:
                    # Deeper than Word supports: demote to body text.
                    current_paragraph.append(heading_text)
            elif line.startswith(("-", "*", "•")):
                flush_paragraph()
                bullet_text = line.lstrip("-*• ").strip()
                doc.add_paragraph(bullet_text, style="List Bullet")
            else:
                # Strip bold markers: users reported excessive asterisks.
                current_paragraph.append(line.replace("**", ""))

        flush_paragraph()

        doc.add_page_break()
        doc.add_paragraph("*Generado por CBCFacil*")

        doc.save(output_path)
        return output_path

    def _create_pdf(self, summary: str, base_name: str) -> Path:
        """Create a PDF rendering of the Markdown summary with ReportLab.

        Raises:
            FileProcessingError: if reportlab is not installed.
        """
        try:
            from reportlab.lib.pagesizes import letter
            from reportlab.pdfgen import canvas
            import textwrap
        except ImportError:
            raise FileProcessingError("reportlab not installed")

        output_dir = settings.LOCAL_DOWNLOADS_PATH
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / f"{base_name}_unificado.pdf"

        c = canvas.Canvas(str(output_path), pagesize=letter)
        width, height = letter
        margin = 72  # one inch, in points
        y_position = height - margin

        def new_page() -> None:
            """Start a fresh page and reset the font and cursor."""
            nonlocal y_position
            c.showPage()
            c.setFont("Helvetica", 11)
            y_position = height - margin

        c.setFont("Helvetica", 11)

        # Title line, truncated defensively.
        c.setFont("Helvetica-Bold", 16)
        c.drawString(margin, y_position, base_name.replace("_", " ").title()[:100])
        y_position -= 28
        c.setFont("Helvetica", 11)

        # Remove bold markers globally for a cleaner PDF.
        summary_clean = summary.replace("**", "")

        for raw_line in summary_clean.splitlines():
            line = raw_line.rstrip()

            if not line.strip():
                y_position -= 14
                if y_position < margin:
                    new_page()
                continue

            stripped = line.lstrip()

            if stripped.startswith("#"):
                level = len(stripped) - len(stripped.lstrip("#"))
                heading_text = stripped.lstrip("#").strip()
                if heading_text:
                    font_size = 16 if level == 1 else 14 if level == 2 else 12
                    c.setFont("Helvetica-Bold", font_size)
                    c.drawString(margin, y_position, heading_text[:90])
                    y_position -= font_size + 6
                    if y_position < margin:
                        new_page()
                    c.setFont("Helvetica", 11)
                continue

            if stripped.startswith(("-", "*", "•")):
                bullet_text = stripped.lstrip("-*•").strip()
                wrapped_lines = textwrap.wrap(bullet_text, width=80) or [""]
                for idx, wrapped in enumerate(wrapped_lines):
                    # Continuation lines hang-indent under the bullet marker.
                    prefix = "• " if idx == 0 else "  "
                    c.drawString(margin, y_position, f"{prefix}{wrapped}")
                    y_position -= 14
                    if y_position < margin:
                        new_page()
                continue

            # Body text: plain wrapping (true justification would require
            # ReportLab Paragraph styles).
            for wrapped in textwrap.wrap(stripped, width=90) or [""]:
                c.drawString(margin, y_position, wrapped)
                y_position -= 14
                if y_position < margin:
                    new_page()

        c.save()
        return output_path