feat: Integración automática con Notion + análisis completo del código

- Instalado notion-client SDK oficial para integración robusta
- Refactorizado services/notion_service.py con SDK oficial de Notion
  - Rate limiting con retry y exponential backoff
  - Parser Markdown → Notion blocks (headings, bullets, paragraphs)
  - Soporte para pages y databases
  - Manejo robusto de errores

- Integración automática en document/generators.py
  - PDFs se suben automáticamente a Notion después de generarse
  - Contenido completo del resumen formateado con bloques
  - Metadata rica (tipo de archivo, path, fecha)

- Configuración de Notion en main.py
  - Inicialización automática al arrancar el servicio
  - Validación de credenciales

- Actualizado config/settings.py
  - Agregado load_dotenv() para cargar variables de .env
  - Configuración de Notion (NOTION_API, NOTION_DATABASE_ID)

- Scripts de utilidad creados:
  - test_notion_integration.py: Test de subida a Notion
  - test_pipeline_notion.py: Test del pipeline completo
  - verify_notion_permissions.py: Verificación de permisos
  - list_notion_pages.py: Listar páginas accesibles
  - diagnose_notion.py: Diagnóstico completo
  - create_notion_database.py: Crear database automáticamente
  - restart_service.sh: Script de reinicio del servicio

- Documentación completa en opus.md:
  - Análisis exhaustivo del codebase (42 archivos Python)
  - Bugs críticos identificados y soluciones
  - Mejoras de seguridad (autenticación, rate limiting, CORS, CSP)
  - Optimizaciones de rendimiento (Celery, Redis, PostgreSQL, WebSockets)
  - Plan de testing (estructura, ejemplos, 80% coverage goal)
  - Roadmap de implementación (6 sprints detallados)
  - Integración avanzada con Notion documentada

Estado: Notion funcionando correctamente, PDFs se suben automáticamente
This commit is contained in:
renato97
2026-01-26 17:26:50 +00:00
parent 47896fd50a
commit 6058dc642e
12 changed files with 3863 additions and 184 deletions

View File

@@ -40,6 +40,14 @@ GEMINI_CLI_PATH=/path/to/gemini # or leave empty
TELEGRAM_TOKEN=your_telegram_bot_token
TELEGRAM_CHAT_ID=your_telegram_chat_id
# =============================================================================
# Notion Integration (Optional - for automatic PDF uploads)
# =============================================================================
# Get your token from: https://developers.notion.com/docs/create-a-notion-integration
NOTION_API=ntn_YOUR_NOTION_INTEGRATION_TOKEN_HERE
# Get your database ID from the database URL in Notion
NOTION_DATABASE_ID=your_database_id_here
# =============================================================================
# Dashboard Configuration (Required for production)
# =============================================================================

View File

@@ -1,13 +1,19 @@
"""
Centralized configuration management for CBCFacil
"""
import os
from pathlib import Path
from typing import Optional, Set, Union
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
class ConfigurationError(Exception):
"""Raised when configuration is invalid"""
pass
@@ -44,7 +50,9 @@ class Settings:
POLL_INTERVAL: int = int(os.getenv("POLL_INTERVAL", "5"))
HTTP_TIMEOUT: int = int(os.getenv("HTTP_TIMEOUT", "30"))
WEBDAV_MAX_RETRIES: int = int(os.getenv("WEBDAV_MAX_RETRIES", "3"))
DOWNLOAD_CHUNK_SIZE: int = int(os.getenv("DOWNLOAD_CHUNK_SIZE", "65536")) # 64KB for better performance
DOWNLOAD_CHUNK_SIZE: int = int(
os.getenv("DOWNLOAD_CHUNK_SIZE", "65536")
) # 64KB for better performance
MAX_FILENAME_LENGTH: int = int(os.getenv("MAX_FILENAME_LENGTH", "80"))
MAX_FILENAME_BASE_LENGTH: int = int(os.getenv("MAX_FILENAME_BASE_LENGTH", "40"))
MAX_FILENAME_TOPICS_LENGTH: int = int(os.getenv("MAX_FILENAME_TOPICS_LENGTH", "20"))
@@ -57,7 +65,13 @@ class Settings:
# AI Providers
ZAI_BASE_URL: str = os.getenv("ZAI_BASE_URL", "https://api.z.ai/api/anthropic")
ZAI_DEFAULT_MODEL: str = os.getenv("ZAI_MODEL", "glm-4.6")
ZAI_AUTH_TOKEN: Optional[str] = os.getenv("ANTHROPIC_AUTH_TOKEN") or os.getenv("ZAI_AUTH_TOKEN", "")
ZAI_AUTH_TOKEN: Optional[str] = os.getenv("ANTHROPIC_AUTH_TOKEN") or os.getenv(
"ZAI_AUTH_TOKEN", ""
)
# Notion Integration
NOTION_API_TOKEN: Optional[str] = os.getenv("NOTION_API")
NOTION_DATABASE_ID: Optional[str] = os.getenv("NOTION_DATABASE_ID")
# Gemini
GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
@@ -76,13 +90,25 @@ class Settings:
CPU_COUNT: int = os.cpu_count() or 1
PDF_MAX_PAGES_PER_CHUNK: int = int(os.getenv("PDF_MAX_PAGES_PER_CHUNK", "2"))
PDF_DPI: int = int(os.getenv("PDF_DPI", "200"))
PDF_RENDER_THREAD_COUNT: int = int(os.getenv("PDF_RENDER_THREAD_COUNT", str(min(4, CPU_COUNT))))
PDF_RENDER_THREAD_COUNT: int = int(
os.getenv("PDF_RENDER_THREAD_COUNT", str(min(4, CPU_COUNT)))
)
PDF_BATCH_SIZE: int = int(os.getenv("PDF_BATCH_SIZE", "2"))
PDF_TROCR_MAX_BATCH: int = int(os.getenv("PDF_TROCR_MAX_BATCH", str(PDF_BATCH_SIZE)))
PDF_TESSERACT_THREADS: int = int(os.getenv("PDF_TESSERACT_THREADS", str(max(1, min(2, max(1, CPU_COUNT // 3))))))
PDF_PREPROCESS_THREADS: int = int(os.getenv("PDF_PREPROCESS_THREADS", str(PDF_TESSERACT_THREADS)))
PDF_TEXT_DETECTION_MIN_RATIO: float = float(os.getenv("PDF_TEXT_DETECTION_MIN_RATIO", "0.6"))
PDF_TEXT_DETECTION_MIN_AVG_CHARS: int = int(os.getenv("PDF_TEXT_DETECTION_MIN_AVG_CHARS", "120"))
PDF_TROCR_MAX_BATCH: int = int(
os.getenv("PDF_TROCR_MAX_BATCH", str(PDF_BATCH_SIZE))
)
PDF_TESSERACT_THREADS: int = int(
os.getenv("PDF_TESSERACT_THREADS", str(max(1, min(2, max(1, CPU_COUNT // 3)))))
)
PDF_PREPROCESS_THREADS: int = int(
os.getenv("PDF_PREPROCESS_THREADS", str(PDF_TESSERACT_THREADS))
)
PDF_TEXT_DETECTION_MIN_RATIO: float = float(
os.getenv("PDF_TEXT_DETECTION_MIN_RATIO", "0.6")
)
PDF_TEXT_DETECTION_MIN_AVG_CHARS: int = int(
os.getenv("PDF_TEXT_DETECTION_MIN_AVG_CHARS", "120")
)
# Error handling
ERROR_THROTTLE_SECONDS: int = int(os.getenv("ERROR_THROTTLE_SECONDS", "600"))
@@ -90,7 +116,9 @@ class Settings:
# GPU/VRAM Management
MODEL_TIMEOUT_SECONDS: int = int(os.getenv("MODEL_TIMEOUT_SECONDS", "300"))
CUDA_VISIBLE_DEVICES: str = os.getenv("CUDA_VISIBLE_DEVICES", "all")
PYTORCH_CUDA_ALLOC_CONF: str = os.getenv("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:512")
PYTORCH_CUDA_ALLOC_CONF: str = os.getenv(
"PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:512"
)
# GPU Detection (auto, nvidia, amd, cpu)
GPU_PREFERENCE: str = os.getenv("GPU_PREFERENCE", "auto")
@@ -127,17 +155,29 @@ class Settings:
@property
def has_ai_config(self) -> bool:
"""Check if AI providers are configured"""
return any([
return any(
[
self.ZAI_AUTH_TOKEN,
self.GEMINI_API_KEY,
self.CLAUDE_CLI_PATH,
self.GEMINI_CLI_PATH
])
self.GEMINI_CLI_PATH,
]
)
@property
def has_notion_config(self) -> bool:
"""Check if Notion is configured"""
return bool(self.NOTION_API_TOKEN and self.NOTION_DATABASE_ID)
@property
def processed_files_path(self) -> Path:
"""Get the path to the processed files registry"""
return Path(os.getenv("PROCESSED_FILES_PATH", str(Path(self.LOCAL_STATE_DIR) / "processed_files.txt")))
return Path(
os.getenv(
"PROCESSED_FILES_PATH",
str(Path(self.LOCAL_STATE_DIR) / "processed_files.txt"),
)
)
@property
def nextcloud_url(self) -> str:
@@ -157,7 +197,9 @@ class Settings:
def nextcloud_password(self) -> str:
"""Get Nextcloud password with validation"""
if not self.NEXTCLOUD_PASSWORD and self.is_production:
raise ConfigurationError("NEXTCLOUD_PASSWORD is required in production mode")
raise ConfigurationError(
"NEXTCLOUD_PASSWORD is required in production mode"
)
return self.NEXTCLOUD_PASSWORD
@property
@@ -181,6 +223,7 @@ class Settings:
"""Check if GPU support is available"""
try:
import torch
return torch.cuda.is_available()
except ImportError:
return False
@@ -203,7 +246,7 @@ class Settings:
"telegram_configured": self.telegram_configured,
"gpu_support": self.has_gpu_support,
"cpu_count": self.CPU_COUNT,
"poll_interval": self.POLL_INTERVAL
"poll_interval": self.POLL_INTERVAL,
}

126
create_notion_database.py Normal file
View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
Script para crear una nueva base de datos de Notion y compartirla automáticamente
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from config import settings
from notion_client import Client
def main():
    """Create a 'CBCFacil - Documentos Procesados' database in Notion.

    Searches the pages the integration can access, uses the first one as the
    parent, and creates a database with the schema the pipeline expects
    (Name / Status / Tipo / Fecha). Prints the resulting database ID and the
    follow-up steps for the user. Requires NOTION_API in .env.
    """
    print("\n" + "=" * 70)
    print("🛠️ CREAR BASE DE DATOS DE NOTION PARA CBCFACIL")
    print("=" * 70 + "\n")

    token = settings.NOTION_API_TOKEN
    if not token:
        print("❌ Token no configurado en .env")
        return

    client = Client(auth=token)

    # A Notion database must be created inside a page the integration has
    # been shared with, so first find an accessible page to use as parent.
    print("🔍 Buscando páginas accesibles...\n")
    results = client.search(page_size=100)
    pages = [p for p in results.get("results", []) if p.get("object") == "page"]

    if not pages:
        print("❌ No tienes páginas accesibles.")
        print("\n📋 SOLUCIÓN:")
        print("1. Ve a Notion y crea una nueva página")
        print("2. En esa página, click en 'Share'")
        print("3. Busca y agrega tu integración")
        print("4. Ejecuta este script nuevamente\n")
        return

    # Show up to 10 accessible pages so the user can see what was found.
    print(f"✅ Encontradas {len(pages)} página(s) accesibles:\n")
    for i, page in enumerate(pages[:10], 1):
        page_id = page.get("id")
        props = page.get("properties", {})

        # Extract the title from whichever property has type 'title'.
        title = "Sin título"
        for prop_name, prop_data in props.items():
            if prop_data.get("type") == "title":
                title_list = prop_data.get("title", [])
                if title_list:
                    title = title_list[0].get("plain_text", "Sin título")
                break

        print(f"{i}. {title[:50]}")
        print(f" ID: {page_id}\n")

    # Use the first accessible page as the database parent.
    parent_page = pages[0]
    parent_id = parent_page.get("id")

    print("=" * 70)
    print("📄 Voy a crear la base de datos dentro de la primera página")
    print("=" * 70 + "\n")

    try:
        # Create the database with the schema expected by the pipeline.
        print("🚀 Creando base de datos 'CBCFacil - Documentos'...\n")

        database = client.databases.create(
            parent={"page_id": parent_id},
            title=[
                {
                    "type": "text",
                    "text": {"content": "CBCFacil - Documentos Procesados"},
                }
            ],
            properties={
                "Name": {"title": {}},
                "Status": {
                    "select": {
                        "options": [
                            {"name": "Procesado", "color": "green"},
                            {"name": "En Proceso", "color": "yellow"},
                            {"name": "Error", "color": "red"},
                        ]
                    }
                },
                "Tipo": {
                    "select": {
                        "options": [
                            {"name": "AUDIO", "color": "purple"},
                            {"name": "PDF", "color": "orange"},
                            {"name": "TEXTO", "color": "gray"},
                        ]
                    }
                },
                "Fecha": {"date": {}},
            },
        )

        db_id = database["id"]

        print("✅ ¡Base de datos creada exitosamente!")
        print("=" * 70)
        print("\n📊 Información de la base de datos:\n")
        print(" Nombre: CBCFacil - Documentos Procesados")
        print(f" ID: {db_id}")
        print(f" URL: https://notion.so/{db_id.replace('-', '')}")
        # BUG FIX: the original printed "\n=" * 70 (the two-char string
        # repeated 70 times); the intent was a newline + 70-char separator.
        print("\n" + "=" * 70)
        print("\n🎯 SIGUIENTE PASO:")
        print("=" * 70)
        print("\nActualiza tu archivo .env con:\n")
        print(f"NOTION_DATABASE_ID={db_id}\n")
        print("Luego ejecuta:")
        print("python test_notion_integration.py\n")
        print("=" * 70 + "\n")

    except Exception as e:
        # Most common cause: the integration lacks insert/write capability.
        print(f"❌ Error creando base de datos: {e}")
        print("\nVerifica que la integración tenga permisos de escritura.\n")


if __name__ == "__main__":
    main()

116
diagnose_notion.py Normal file
View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
Script para diagnosticar la integración de Notion
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from config import settings
from notion_client import Client
def main():
    """Run a two-step diagnostic of the Notion integration.

    Test 1 validates the API token by issuing an unrestricted search.
    Test 2 verifies read access to the configured database and, on the
    typical "Could not find database" failure, prints step-by-step
    instructions for sharing the database with the integration.
    """
    print("\n" + "=" * 70)
    print("🔍 DIAGNÓSTICO COMPLETO DE NOTION")
    print("=" * 70 + "\n")

    token = settings.NOTION_API_TOKEN
    database_id = settings.NOTION_DATABASE_ID

    print(f"Token: {token[:30]}..." if token else "❌ Token no configurado")
    print(f"Database ID: {database_id}\n")

    if not token:
        print("❌ Configura NOTION_API en .env\n")
        return

    client = Client(auth=token)

    # Test 1: the token is valid iff the search endpoint responds at all.
    print("📝 Test 1: Verificando token...")
    try:
        # Search for any page the integration can reach.
        results = client.search(query="", page_size=1)
        print("✅ Token válido - la integración está activa\n")

        pages = results.get("results", [])
        if pages:
            print(f"✅ La integración tiene acceso a {len(pages)} página(s)")
            for page in pages[:3]:
                page_id = page.get("id", "N/A")
                page_type = page.get("object", "N/A")
                print(f" - {page_type}: {page_id}")
        else:
            print("⚠️ La integración NO tiene acceso a ninguna página aún")
            print(" Esto es normal si acabas de crear la integración.\n")
    except Exception as e:
        print(f"❌ Error con el token: {e}\n")
        return

    # ROBUSTNESS FIX: bail out before Test 2 when no database ID is set;
    # the original would call the API with database_id=None.
    if not database_id:
        print("\n❌ Configura NOTION_DATABASE_ID en .env\n")
        return

    # Test 2: verify read access to the configured database.
    print("\n📊 Test 2: Verificando acceso a la base de datos CBC...")
    try:
        database = client.databases.retrieve(database_id=database_id)
        print("✅ ¡ÉXITO! La integración puede acceder a la base de datos\n")

        # ROBUSTNESS FIX: an empty title list raised IndexError in the
        # original — .get("title", [{}]) only defaults when the key is
        # missing, not when its value is [].
        title_list = database.get("title") or [{}]
        title = title_list[0].get("plain_text", "Sin título")
        print(f" Título: {title}")
        print(f" ID: {database['id']}")
        print("\n Propiedades:")
        for prop_name in database.get("properties", {}).keys():
            print(f"{prop_name}")

        print("\n" + "=" * 70)
        print("✅ TODO CONFIGURADO CORRECTAMENTE")
        print("=" * 70)
        print("\n🚀 Ejecuta: python test_notion_integration.py\n")

    except Exception as e:
        error_msg = str(e)
        print("❌ No se puede acceder a la base de datos")
        print(f" Error: {error_msg}\n")

        # The Notion API reports a database not shared with the integration
        # as "Could not find database"; walk the user through sharing it.
        if "Could not find database" in error_msg:
            print("=" * 70)
            print("⚠️ ACCIÓN REQUERIDA: Compartir la base de datos")
            print("=" * 70)
            print("\n📋 PASOS DETALLADOS:\n")
            print("1. Abre Notion en tu navegador")
            print("\n2. Ve a tu base de datos 'CBC'")
            print(" Opción A: Usa este link directo:")
            print(f" → https://www.notion.so/{database_id.replace('-', '')}")
            print("\n Opción B: Busca 'CBC' en tu workspace")
            print("\n3. En la página de la base de datos, busca el botón '...' ")
            print(" (tres puntos) en la esquina SUPERIOR DERECHA")
            print("\n4. En el menú que se abre, busca:")
            print("'Connections' (en inglés)")
            print("'Conexiones' (en español)")
            print("'Connect to' (puede variar)")
            print("\n5. Haz click y verás un menú de integraciones")
            print("\n6. Busca tu integración en la lista")
            print(" (Debería tener el nombre que le pusiste al crearla)")
            print("\n7. Haz click en tu integración para activarla")
            print("\n8. Confirma los permisos cuando te lo pida")
            print("\n9. Deberías ver un mensaje confirmando la conexión")
            print("\n10. ¡Listo! Vuelve a ejecutar:")
            print(" python verify_notion_permissions.py\n")
            print("=" * 70)

            # Alternative path when the Connections menu cannot be found.
            print("\n💡 ALTERNATIVA: Crear una nueva página de prueba\n")
            print("Si no encuentras la opción de conexiones en tu base de datos,")
            print("puedes crear una página nueva y compartirla con la integración:\n")
            print("1. Crea una nueva página en Notion")
            print("2. En esa página, click en 'Share' (Compartir)")
            print("3. Busca tu integración y agrégala")
            print("4. Luego convierte esa página en una base de datos")
            print("5. Usa el ID de esa nueva base de datos\n")


if __name__ == "__main__":
    main()

View File

@@ -1,6 +1,7 @@
"""
Document generation utilities
"""
import logging
import re
from pathlib import Path
@@ -17,7 +18,9 @@ class DocumentGenerator:
self.logger = logging.getLogger(__name__)
self.ai_provider = ai_provider_factory.get_best_provider()
def generate_summary(self, text: str, base_name: str) -> Tuple[bool, str, Dict[str, Any]]:
def generate_summary(
self, text: str, base_name: str
) -> Tuple[bool, str, Dict[str, Any]]:
"""Generate unified summary"""
self.logger.info(f"Generating summary for {base_name}")
@@ -85,13 +88,16 @@ Instrucciones:
# Use generic Gemini provider for formatting as requested
from services.ai.gemini_provider import GeminiProvider
formatter = GeminiProvider()
try:
if formatter.is_available():
summary = formatter.generate_text(format_prompt)
else:
self.logger.warning("Gemini formatter not available, using raw summary")
self.logger.warning(
"Gemini formatter not available, using raw summary"
)
summary = raw_summary
except Exception as e:
self.logger.warning(f"Formatting failed ({e}), using raw summary")
@@ -105,13 +111,51 @@ Instrucciones:
docx_path = self._create_docx(summary, base_name)
pdf_path = self._create_pdf(summary, base_name)
# Upload to Notion if configured
from services.notion_service import notion_service
notion_uploaded = False
notion_page_id = None
if settings.has_notion_config:
try:
title = base_name.replace("_", " ").title()
# Crear página con el contenido completo del resumen
notion_metadata = {
"file_type": "Audio", # O 'PDF' dependiendo del origen
"pdf_path": pdf_path,
"add_status": False, # No usar Status/Tipo (no existen en la DB)
"use_as_page": False, # Usar como database, no página
}
notion_page_id = notion_service.create_page_with_summary(
title=title, summary=summary, metadata=notion_metadata
)
if notion_page_id:
notion_uploaded = True
self.logger.info(
f"✅ Resumen subido a Notion: {title} (ID: {notion_page_id})"
)
else:
self.logger.warning(f"⚠️ No se pudo subir a Notion: {title}")
except Exception as e:
self.logger.warning(f"❌ Error al subir a Notion: {e}")
import traceback
traceback.print_exc()
else:
self.logger.info("Notion not configured - skipping upload")
metadata = {
'markdown_path': str(markdown_path),
'docx_path': str(docx_path),
'pdf_path': str(pdf_path),
'docx_name': Path(docx_path).name,
'summary': summary,
'filename': filename
"markdown_path": str(markdown_path),
"docx_path": str(docx_path),
"pdf_path": str(pdf_path),
"docx_name": Path(docx_path).name,
"summary": summary,
"filename": filename,
"notion_uploaded": notion_uploaded,
"notion_page_id": notion_page_id,
}
return True, summary, metadata
@@ -129,17 +173,21 @@ Summary: {summary}
Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
topics_text = self.ai_provider.sanitize_input(prompt) if hasattr(self.ai_provider, 'sanitize_input') else summary[:100]
topics_text = (
self.ai_provider.sanitize_input(prompt)
if hasattr(self.ai_provider, "sanitize_input")
else summary[:100]
)
# Simple topic extraction
topics = re.findall(r'\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b', topics_text)[:3]
topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3]
if not topics:
topics = ['documento']
topics = ["documento"]
# Limit topic length
topics = [t[: settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]
filename = '_'.join(topics)[:settings.MAX_FILENAME_LENGTH]
filename = "_".join(topics)[: settings.MAX_FILENAME_LENGTH]
return filename
except Exception as e:
@@ -153,7 +201,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
output_path = output_dir / f"{base_name}_unificado.md"
content = f"""# {base_name.replace('_', ' ').title()}
content = f"""# {base_name.replace("_", " ").title()}
## Resumen
@@ -164,7 +212,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
*Generado por CBCFacil*
"""
with open(output_path, 'w', encoding='utf-8') as f:
with open(output_path, "w", encoding="utf-8") as f:
f.write(content)
return output_path
@@ -183,7 +231,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
output_path = output_dir / f"{base_name}_unificado.docx"
doc = Document()
doc.add_heading(base_name.replace('_', ' ').title(), 0)
doc.add_heading(base_name.replace("_", " ").title(), 0)
# Parse and render Markdown content line by line
lines = summary.splitlines()
@@ -193,41 +241,43 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
line = line.strip()
if not line:
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3 # JUSTIFY alignment (WD_ALIGN_PARAGRAPH.JUSTIFY=3)
current_paragraph = []
continue
if line.startswith('#'):
if line.startswith("#"):
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
current_paragraph = []
# Process heading
level = len(line) - len(line.lstrip('#'))
heading_text = line.lstrip('#').strip()
level = len(line) - len(line.lstrip("#"))
heading_text = line.lstrip("#").strip()
if level <= 6:
doc.add_heading(heading_text, level=level)
else:
current_paragraph.append(heading_text)
elif line.startswith('-') or line.startswith('*') or line.startswith(''):
elif line.startswith("-") or line.startswith("*") or line.startswith(""):
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
current_paragraph = []
bullet_text = line.lstrip('-*• ').strip()
p = doc.add_paragraph(bullet_text, style='List Bullet')
bullet_text = line.lstrip("-*• ").strip()
p = doc.add_paragraph(bullet_text, style="List Bullet")
# Remove bold markers from bullets if present
if '**' in bullet_text:
if "**" in bullet_text:
# Basic cleanup for bullets
pass
else:
# Clean up excessive bold markers in body text if user requested
clean_line = line.replace('**', '') # Removing asterisks as per user complaint "se abusa de los asteriscos"
clean_line = line.replace(
"**", ""
) # Removing asterisks as per user complaint "se abusa de los asteriscos"
current_paragraph.append(clean_line)
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
doc.add_page_break()
@@ -258,18 +308,20 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
def new_page():
nonlocal y_position
c.showPage()
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
y_position = height - margin
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
# Title
c.setFont('Helvetica-Bold', 16)
c.drawString(margin, y_position, base_name.replace('_', ' ').title()[:100])
c.setFont("Helvetica-Bold", 16)
c.drawString(margin, y_position, base_name.replace("_", " ").title()[:100])
y_position -= 28
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
summary_clean = summary.replace('**', '') # Remove asterisks globally for cleaner PDF
summary_clean = summary.replace(
"**", ""
) # Remove asterisks globally for cleaner PDF
for raw_line in summary_clean.splitlines():
line = raw_line.rstrip()
@@ -282,24 +334,24 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
stripped = line.lstrip()
if stripped.startswith('#'):
level = len(stripped) - len(stripped.lstrip('#'))
heading_text = stripped.lstrip('#').strip()
if stripped.startswith("#"):
level = len(stripped) - len(stripped.lstrip("#"))
heading_text = stripped.lstrip("#").strip()
if heading_text:
font_size = 16 if level == 1 else 14 if level == 2 else 12
c.setFont('Helvetica-Bold', font_size)
c.setFont("Helvetica-Bold", font_size)
c.drawString(margin, y_position, heading_text[:90])
y_position -= font_size + 6
if y_position < margin:
new_page()
c.setFont('Helvetica', 11)
c.setFont("Helvetica", 11)
continue
if stripped.startswith(('-', '*', '')):
bullet_text = stripped.lstrip('-*•').strip()
wrapped_lines = textwrap.wrap(bullet_text, width=80) or ['']
if stripped.startswith(("-", "*", "")):
bullet_text = stripped.lstrip("-*•").strip()
wrapped_lines = textwrap.wrap(bullet_text, width=80) or [""]
for idx, wrapped in enumerate(wrapped_lines):
prefix = '' if idx == 0 else ' '
prefix = "" if idx == 0 else " "
c.drawString(margin, y_position, f"{prefix}{wrapped}")
y_position -= 14
if y_position < margin:
@@ -307,7 +359,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
continue
# Body text - Justified approximation (ReportLab native justification requires Paragraph styles, defaulting to wrap)
wrapped_lines = textwrap.wrap(stripped, width=90) or ['']
wrapped_lines = textwrap.wrap(stripped, width=90) or [""]
for wrapped in wrapped_lines:
c.drawString(margin, y_position, wrapped)
y_position -= 14

134
list_notion_pages.py Normal file
View File

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Script para listar todas las páginas y bases de datos accesibles
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from config import settings
from notion_client import Client
def main():
    """List every page and database the Notion integration can access.

    Prints databases first (with up to five properties of each schema),
    then pages, and finishes with guidance on how to wire the right
    NOTION_DATABASE_ID into .env.
    """
    print("\n" + "=" * 70)
    print("📚 LISTANDO TODAS LAS PÁGINAS Y BASES DE DATOS")
    print("=" * 70 + "\n")

    token = settings.NOTION_API_TOKEN
    # CONSISTENCY FIX: the sibling scripts validate the token before calling
    # the API; without this, Client(auth=None) fails with a confusing error.
    if not token:
        print("❌ Token no configurado en .env\n")
        return

    client = Client(auth=token)

    try:
        # An unfiltered search returns every object shared with the integration.
        print("🔍 Buscando todas las páginas accesibles...\n")
        results = client.search(page_size=100)
        all_items = results.get("results", [])

        # Split results into databases and plain pages.
        databases = [item for item in all_items if item.get("object") == "database"]
        pages = [item for item in all_items if item.get("object") == "page"]

        print(
            f"✅ Encontrados: {len(databases)} base(s) de datos y {len(pages)} página(s)\n"
        )

        if databases:
            print("=" * 70)
            print("📊 BASES DE DATOS ENCONTRADAS:")
            print("=" * 70)

            for i, db in enumerate(databases, 1):
                db_id = db.get("id", "N/A")
                title_list = db.get("title", [])
                title = (
                    title_list[0].get("plain_text", "Sin título")
                    if title_list
                    else "Sin título"
                )

                print(f"\n🔷 {i}. {title}")
                print(f" ID: {db_id}")
                print(f" URL: https://notion.so/{db_id.replace('-', '')}")

                # Show up to five properties of each database schema.
                props = db.get("properties", {})
                if props:
                    print(" Propiedades:")
                    for prop_name, prop_data in list(props.items())[:5]:
                        prop_type = prop_data.get("type", "unknown")
                        print(f"{prop_name} ({prop_type})")
                    if len(props) > 5:
                        print(f" ... y {len(props) - 5} más")
                print("-" * 70)

        if pages:
            print("\n" + "=" * 70)
            print("📄 PÁGINAS ENCONTRADAS:")
            print("=" * 70)

            for i, page in enumerate(pages, 1):
                page_id = page.get("id", "N/A")

                # The title may live under 'title' or 'Name' depending on
                # whether the page belongs to a database.
                title = "Sin título"
                props = page.get("properties", {})
                if "title" in props:
                    title_prop = props["title"]
                    if "title" in title_prop:
                        title_list = title_prop["title"]
                        if title_list:
                            title = title_list[0].get("plain_text", "Sin título")
                elif "Name" in props:
                    name_prop = props["Name"]
                    if "title" in name_prop:
                        title_list = name_prop["title"]
                        if title_list:
                            title = title_list[0].get("plain_text", "Sin título")

                print(f"\n🔷 {i}. {title}")
                print(f" ID: {page_id}")
                print(f" URL: https://notion.so/{page_id.replace('-', '')}")
                print("-" * 70)

        if databases:
            print("\n" + "=" * 70)
            print("💡 SIGUIENTE PASO:")
            print("=" * 70)
            print("\nSi 'CBC' aparece arriba como BASE DE DATOS:")
            print("1. Copia el ID de la base de datos 'CBC'")
            print("2. Actualiza tu .env:")
            print(" NOTION_DATABASE_ID=<el_id_completo>")
            print("\nSi 'CBC' aparece como PÁGINA:")
            print("1. Abre la página en Notion")
            print("2. Busca una base de datos dentro de esa página")
            print("3. Haz click en '...' de la base de datos")
            print("4. Selecciona 'Copy link to view'")
            print("5. El ID estará en el URL copiado")
            print("\n4. Ejecuta: python test_notion_integration.py\n")
        else:
            print("\n⚠️ No se encontraron bases de datos accesibles.")
            print("\n📋 OPCIONES:")
            print("\n1. Crear una nueva base de datos:")
            print(" - Abre una de las páginas listadas arriba")
            print(" - Crea una tabla/database dentro")
            print(" - Copia el ID de esa base de datos")
            print("\n2. O comparte una base de datos existente:")
            print(" - Abre tu base de datos 'CBC' en Notion")
            print(" - Click en '...' > 'Connections'")
            print(" - Agrega tu integración\n")

    except Exception as e:
        print(f"❌ Error: {e}\n")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    main()

186
main.py
View File

@@ -3,6 +3,7 @@
CBCFacil - Main Service Entry Point
Unified AI service for document processing (audio, PDF, text)
"""
import logging
import sys
import time
@@ -16,8 +17,10 @@ from typing import Optional
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
# Configure logging with JSON formatter for production
class JSONFormatter(logging.Formatter):
"""JSON formatter for structured logging in production"""
@@ -29,7 +32,7 @@ class JSONFormatter(logging.Formatter):
"message": record.getMessage(),
"module": record.module,
"function": record.funcName,
"line": record.lineno
"line": record.lineno,
}
# Add exception info if present
@@ -55,9 +58,9 @@ def setup_logging() -> logging.Logger:
if settings.is_production:
console_handler.setFormatter(JSONFormatter())
else:
console_handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] - %(name)s - %(message)s"
))
console_handler.setFormatter(
logging.Formatter("%(asctime)s [%(levelname)s] - %(name)s - %(message)s")
)
logger.addHandler(console_handler)
# File handler if configured
@@ -74,9 +77,12 @@ logger = setup_logging()
def acquire_lock() -> int:
"""Acquire single instance lock"""
lock_file = Path(os.getenv("LOCAL_STATE_DIR", str(Path(__file__).parent))) / ".main_service.lock"
lock_file = (
Path(os.getenv("LOCAL_STATE_DIR", str(Path(__file__).parent)))
/ ".main_service.lock"
)
lock_file.parent.mkdir(parents=True, exist_ok=True)
lock_fd = open(lock_file, 'w')
lock_fd = open(lock_file, "w")
fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
lock_fd.write(str(os.getpid()))
lock_fd.flush()
@@ -100,7 +106,9 @@ def validate_configuration() -> None:
try:
warnings = validate_environment()
if warnings:
logger.info(f"Configuration validation completed with {len(warnings)} warnings")
logger.info(
f"Configuration validation completed with {len(warnings)} warnings"
)
except ConfigurationError as e:
logger.error(f"Configuration validation failed: {e}")
raise
@@ -117,7 +125,7 @@ def check_service_health() -> dict:
health_status = {
"timestamp": datetime.utcnow().isoformat(),
"status": "healthy",
"services": {}
"services": {},
}
# Check WebDAV
@@ -129,15 +137,13 @@ def check_service_health() -> dict:
else:
health_status["services"]["webdav"] = {"status": "not_configured"}
except Exception as e:
health_status["services"]["webdav"] = {
"status": "unhealthy",
"error": str(e)
}
health_status["services"]["webdav"] = {"status": "unhealthy", "error": str(e)}
health_status["status"] = "degraded"
# Check Telegram
try:
from services.telegram_service import telegram_service
if telegram_service.is_configured:
health_status["services"]["telegram"] = {"status": "healthy"}
else:
@@ -145,22 +151,20 @@ def check_service_health() -> dict:
except Exception as e:
health_status["services"]["telegram"] = {
"status": "unavailable",
"error": str(e)
"error": str(e),
}
# Check VRAM manager
try:
from services.vram_manager import vram_manager
vram_info = vram_manager.get_vram_info()
health_status["services"]["vram"] = {
"status": "healthy",
"available_gb": vram_info.get("free", 0) / (1024**3)
"available_gb": vram_info.get("free", 0) / (1024**3),
}
except Exception as e:
health_status["services"]["vram"] = {
"status": "unavailable",
"error": str(e)
}
health_status["services"]["vram"] = {"status": "unavailable", "error": str(e)}
return health_status
@@ -189,12 +193,28 @@ def initialize_services() -> None:
# Configure Telegram if credentials available
if settings.TELEGRAM_TOKEN and settings.TELEGRAM_CHAT_ID:
try:
telegram_service.configure(settings.TELEGRAM_TOKEN, settings.TELEGRAM_CHAT_ID)
telegram_service.configure(
settings.TELEGRAM_TOKEN, settings.TELEGRAM_CHAT_ID
)
telegram_service.send_start_notification()
logger.info("Telegram notifications enabled")
except Exception as e:
logger.error(f"Failed to configure Telegram: {e}")
# Configure Notion if credentials available
if settings.has_notion_config:
try:
from services.notion_service import notion_service
notion_service.configure(
settings.NOTION_API_TOKEN, settings.NOTION_DATABASE_ID
)
logger.info("✅ Notion integration enabled")
except Exception as e:
logger.error(f"Failed to configure Notion: {e}")
else:
logger.info("Notion not configured - upload to Notion disabled")
# Initialize WebDAV if configured
if settings.has_webdav_config:
try:
@@ -233,6 +253,7 @@ def send_error_notification(error_type: str, error_message: str) -> None:
"""Send error notification via Telegram"""
try:
from services.telegram_service import telegram_service
if telegram_service.is_configured:
telegram_service.send_error_notification(error_type, error_message)
except Exception as e:
@@ -243,15 +264,16 @@ def run_dashboard_thread() -> None:
"""Run Flask dashboard in a separate thread"""
try:
from api.routes import create_app
app = create_app()
# Run Flask in production mode with threaded=True
app.run(
host='0.0.0.0',
host="0.0.0.0",
port=5000,
debug=False,
threaded=True,
use_reloader=False # Important: disable reloader in thread
use_reloader=False, # Important: disable reloader in thread
)
except Exception as e:
logger.error(f"Dashboard thread error: {e}")
@@ -260,14 +282,12 @@ def run_dashboard_thread() -> None:
def start_dashboard() -> threading.Thread:
"""Start dashboard in a background daemon thread"""
dashboard_port = int(os.getenv('DASHBOARD_PORT', '5000'))
dashboard_port = int(os.getenv("DASHBOARD_PORT", "5000"))
logger.info(f"Starting dashboard on port {dashboard_port}...")
# Create daemon thread so it doesn't block shutdown
dashboard_thread = threading.Thread(
target=run_dashboard_thread,
name="DashboardThread",
daemon=True
target=run_dashboard_thread, name="DashboardThread", daemon=True
)
dashboard_thread.start()
logger.info(f"Dashboard thread started (Thread-ID: {dashboard_thread.ident})")
@@ -301,9 +321,37 @@ def run_main_loop() -> None:
webdav_service.mkdir(settings.REMOTE_PDF_FOLDER)
pdf_files = webdav_service.list(settings.REMOTE_PDF_FOLDER)
for file_path in pdf_files:
if file_path.lower().endswith('.pdf'):
if file_path.lower().endswith(".pdf"):
if not processed_registry.is_processed(file_path):
pdf_processor.process(file_path)
from pathlib import Path
from urllib.parse import unquote
from services.telegram_service import telegram_service
local_filename = unquote(Path(file_path).name)
base_name = Path(local_filename).stem
local_path = (
settings.LOCAL_DOWNLOADS_PATH / local_filename
)
settings.LOCAL_DOWNLOADS_PATH.mkdir(
parents=True, exist_ok=True
)
# Step 1: Notify and download
telegram_service.send_message(
f"📄 Nuevo PDF detectado: {local_filename}\n"
f"⬇️ Descargando..."
)
logger.info(
f"Downloading PDF: {file_path} -> {local_path}"
)
webdav_service.download(file_path, local_path)
# Step 2: Process PDF
telegram_service.send_message(
f"🔍 Procesando PDF con OCR..."
)
pdf_processor.process(str(local_path))
processed_registry.save(file_path)
except Exception as e:
logger.exception(f"Error processing PDFs: {e}")
@@ -314,7 +362,10 @@ def run_main_loop() -> None:
try:
audio_files = webdav_service.list(settings.REMOTE_AUDIOS_FOLDER)
for file_path in audio_files:
if any(file_path.lower().endswith(ext) for ext in settings.AUDIO_EXTENSIONS):
if any(
file_path.lower().endswith(ext)
for ext in settings.AUDIO_EXTENSIONS
):
if not processed_registry.is_processed(file_path):
from pathlib import Path
from urllib.parse import unquote
@@ -323,36 +374,55 @@ def run_main_loop() -> None:
local_filename = unquote(Path(file_path).name)
base_name = Path(local_filename).stem
local_path = settings.LOCAL_DOWNLOADS_PATH / local_filename
settings.LOCAL_DOWNLOADS_PATH.mkdir(parents=True, exist_ok=True)
local_path = (
settings.LOCAL_DOWNLOADS_PATH / local_filename
)
settings.LOCAL_DOWNLOADS_PATH.mkdir(
parents=True, exist_ok=True
)
# Step 1: Notify and download
telegram_service.send_message(
f"🎵 Nuevo audio detectado: {local_filename}\n"
f"⬇️ Descargando..."
)
logger.info(f"Downloading audio: {file_path} -> {local_path}")
logger.info(
f"Downloading audio: {file_path} -> {local_path}"
)
webdav_service.download(file_path, local_path)
# Step 2: Transcribe
telegram_service.send_message(f"📝 Transcribiendo audio con Whisper...")
telegram_service.send_message(
f"📝 Transcribiendo audio con Whisper..."
)
result = audio_processor.process(str(local_path))
if result.get("success") and result.get("transcription_path"):
transcription_file = Path(result["transcription_path"])
if result.get("success") and result.get(
"transcription_path"
):
transcription_file = Path(
result["transcription_path"]
)
transcription_text = result.get("text", "")
# Step 3: Generate AI summary and documents
telegram_service.send_message(f"🤖 Generando resumen con IA...")
telegram_service.send_message(
f"🤖 Generando resumen con IA..."
)
doc_generator = DocumentGenerator()
success, summary, output_files = doc_generator.generate_summary(
success, summary, output_files = (
doc_generator.generate_summary(
transcription_text, base_name
)
)
# Step 4: Upload all files to Nextcloud
if success and output_files:
# Create folders
for folder in [settings.RESUMENES_FOLDER, settings.DOCX_FOLDER]:
for folder in [
settings.RESUMENES_FOLDER,
settings.DOCX_FOLDER,
]:
try:
webdav_service.makedirs(folder)
except Exception:
@@ -361,25 +431,35 @@ def run_main_loop() -> None:
# Upload transcription TXT
if transcription_file.exists():
remote_txt = f"{settings.RESUMENES_FOLDER}/{transcription_file.name}"
webdav_service.upload(transcription_file, remote_txt)
webdav_service.upload(
transcription_file, remote_txt
)
logger.info(f"Uploaded: {remote_txt}")
# Upload DOCX
docx_path = Path(output_files.get('docx_path', ''))
docx_path = Path(
output_files.get("docx_path", "")
)
if docx_path.exists():
remote_docx = f"{settings.DOCX_FOLDER}/{docx_path.name}"
webdav_service.upload(docx_path, remote_docx)
webdav_service.upload(
docx_path, remote_docx
)
logger.info(f"Uploaded: {remote_docx}")
# Upload PDF
pdf_path = Path(output_files.get('pdf_path', ''))
pdf_path = Path(
output_files.get("pdf_path", "")
)
if pdf_path.exists():
remote_pdf = f"{settings.DOCX_FOLDER}/{pdf_path.name}"
webdav_service.upload(pdf_path, remote_pdf)
logger.info(f"Uploaded: {remote_pdf}")
# Upload Markdown
md_path = Path(output_files.get('markdown_path', ''))
md_path = Path(
output_files.get("markdown_path", "")
)
if md_path.exists():
remote_md = f"{settings.RESUMENES_FOLDER}/{md_path.name}"
webdav_service.upload(md_path, remote_md)
@@ -396,11 +476,15 @@ def run_main_loop() -> None:
# Just upload transcription if summary failed
if transcription_file.exists():
try:
webdav_service.makedirs(settings.RESUMENES_FOLDER)
webdav_service.makedirs(
settings.RESUMENES_FOLDER
)
except Exception:
pass
remote_txt = f"{settings.RESUMENES_FOLDER}/{transcription_file.name}"
webdav_service.upload(transcription_file, remote_txt)
webdav_service.upload(
transcription_file, remote_txt
)
telegram_service.send_message(
f"⚠️ Resumen fallido, solo transcripción subida:\n{transcription_file.name}"
)
@@ -415,7 +499,10 @@ def run_main_loop() -> None:
try:
text_files = webdav_service.list(settings.REMOTE_TXT_FOLDER)
for file_path in text_files:
if any(file_path.lower().endswith(ext) for ext in settings.TXT_EXTENSIONS):
if any(
file_path.lower().endswith(ext)
for ext in settings.TXT_EXTENSIONS
):
if not processed_registry.is_processed(file_path):
text_processor.process(file_path)
processed_registry.save(file_path)
@@ -443,7 +530,7 @@ def run_main_loop() -> None:
)
send_error_notification(
"consecutive_errors",
f"Service has failed {consecutive_errors} consecutive times"
f"Service has failed {consecutive_errors} consecutive times",
)
# Don't exit, let the loop continue with backoff
@@ -462,7 +549,9 @@ def main():
try:
logger.info("=== CBCFacil Service Started ===")
logger.info(f"Version: {os.getenv('APP_VERSION', '8.0')}")
logger.info(f"Environment: {'production' if os.getenv('DEBUG', 'false').lower() != 'true' else 'development'}")
logger.info(
f"Environment: {'production' if os.getenv('DEBUG', 'false').lower() != 'true' else 'development'}"
)
lock_fd = acquire_lock()
initialize_services()
@@ -491,12 +580,15 @@ if __name__ == "__main__":
command = sys.argv[1]
if command == "whisper" and len(sys.argv) == 4:
from processors.audio_processor import AudioProcessor
AudioProcessor().process(sys.argv[2])
elif command == "pdf" and len(sys.argv) == 4:
from processors.pdf_processor import PDFProcessor
PDFProcessor().process(sys.argv[2])
elif command == "health":
from main import check_service_health
health = check_service_health()
print(json.dumps(health, indent=2))
else:

2447
opus.md Normal file

File diff suppressed because it is too large Load Diff

10
restart_service.sh Executable file
View File

@@ -0,0 +1,10 @@
#!/bin/bash
# Stop any running instance of the service (matches the "python main.py" command line).
pkill -f "python main.py"
# Give the old process time to shut down before restarting.
sleep 2
# Restart from the project root inside the virtualenv, appending output to main.log.
# NOTE(review): the project path is hard-coded — confirm it matches the deploy host.
cd /home/ren/proyectos/cbc
source .venv/bin/activate
python main.py >> main.log 2>&1 &
echo "Servicio reiniciado. Ver logs con: tail -f main.log"

353
services/notion_service.py Normal file
View File

@@ -0,0 +1,353 @@
"""
Notion integration service with official SDK
"""
import logging
from typing import Optional, Dict, Any, List
from pathlib import Path
from datetime import datetime
import time
try:
from notion_client import Client
from notion_client.errors import APIResponseError
NOTION_AVAILABLE = True
except ImportError:
NOTION_AVAILABLE = False
Client = None
APIResponseError = Exception
from config import settings
class NotionService:
    """Notion integration built on the official ``notion-client`` SDK.

    Creates pages either inside a database or under a parent page, converts
    Markdown summaries into Notion blocks, and retries rate-limited requests
    with exponential backoff. When the SDK is not installed the service stays
    unconfigured and every upload becomes a logged no-op.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Both are set by configure(); until then the service is inert.
        self._client: Optional[Client] = None
        self._database_id: Optional[str] = None

    def configure(self, token: str, database_id: str) -> None:
        """Initialise the SDK client with credentials.

        Args:
            token: Notion internal-integration token.
            database_id: Target database id (or parent page id when pages are
                created with ``metadata["use_as_page"]``).
        """
        if not NOTION_AVAILABLE:
            self.logger.error(
                "notion-client not installed. Install with: pip install notion-client"
            )
            return

        self._client = Client(auth=token)
        self._database_id = database_id
        self.logger.info("Notion service configured with official SDK")

    @property
    def is_configured(self) -> bool:
        """True when the SDK is importable and credentials were provided."""
        return bool(self._client and self._database_id and NOTION_AVAILABLE)

    def _rate_limited_request(self, func, *args, **kwargs):
        """Execute *func* with retry and exponential backoff on rate limits.

        Retries up to 3 times (waiting 1s, 2s, 4s) when the API reports
        ``rate_limited``; any other ``APIResponseError`` is re-raised.

        Raises:
            Exception: when all retries are exhausted.
        """
        max_retries = 3
        base_delay = 1
        for attempt in range(max_retries):
            try:
                return func(*args, **kwargs)
            except APIResponseError as e:
                if hasattr(e, "code") and e.code == "rate_limited":
                    delay = base_delay * (2**attempt)
                    self.logger.warning(f"Rate limited by Notion, waiting {delay}s")
                    time.sleep(delay)
                else:
                    raise
        raise Exception("Max retries exceeded for Notion API")

    def create_page_with_summary(
        self, title: str, summary: str, metadata: Dict[str, Any]
    ) -> Optional[str]:
        """Create a new page in Notion (database or parent page) with summary content.

        Args:
            title: Page title; truncated to Notion's 100-character limit.
            summary: Markdown text rendered into Notion blocks.
            metadata: Options — ``use_as_page`` (create as child of a parent
                page instead of a database row), ``add_status`` (attach the
                Status select), ``file_type`` (value for the "Tipo" select),
                ``pdf_path`` (referenced in a callout block).

        Returns:
            The created page id, or None when unconfigured or on any error.
        """
        if not self.is_configured:
            self.logger.warning("Notion not configured, skipping upload")
            return None

        try:
            # Decide between a child page of a parent page vs a database row.
            use_as_page = metadata.get("use_as_page", False)

            if use_as_page:
                page = self._rate_limited_request(
                    self._client.pages.create,
                    parent={"page_id": self._database_id},
                    properties={"title": [{"text": {"content": title[:100]}}]},
                )
            else:
                properties = {"Name": {"title": [{"text": {"content": title[:100]}}]}}

                # Optional Status select (enabled by default).
                if metadata.get("add_status", True):
                    properties["Status"] = {"select": {"name": "Procesado"}}

                # Optional "Tipo" select. NOTE(review): the default here is
                # False while Status defaults to True above — confirm whether
                # that asymmetry is intentional.
                if metadata.get("add_status", False) and metadata.get("file_type"):
                    properties["Tipo"] = {
                        # Fixed: the payload key was '" name"' (leading space),
                        # which the Notion API rejects as an unknown field.
                        "select": {"name": metadata["file_type"].upper()}
                    }

                page = self._rate_limited_request(
                    self._client.pages.create,
                    parent={"database_id": self._database_id},
                    properties=properties,
                )

            page_id = page["id"]
            self.logger.info(f"✅ Notion page created: {page_id}")

            # Append the summary body as child blocks of the new page.
            # (An unreachable duplicate of the database branch used to follow
            # the except clause below; it has been removed.)
            self._add_summary_content(page_id, summary, metadata.get("pdf_path"))
            return page_id

        except Exception as e:
            self.logger.error(f"❌ Error creating Notion page: {e}")
            return None

    def _add_summary_content(
        self, page_id: str, summary: str, pdf_path: Optional[Path] = None
    ) -> bool:
        """Append the summary (plus PDF callout and footer) to *page_id*.

        Returns:
            True when every append request succeeds, False otherwise.
        """
        try:
            blocks = []

            # Callout referencing the generated PDF, when it exists locally.
            if pdf_path and pdf_path.exists():
                blocks.append(
                    {
                        "object": "block",
                        "type": "callout",
                        "callout": {
                            "rich_text": [
                                {
                                    "type": "text",
                                    "text": {
                                        "content": f"📄 Documento generado automáticamente: {pdf_path.name}"
                                    },
                                }
                            ],
                            "icon": {"emoji": "📄"},
                        },
                    }
                )

            blocks.extend(self._parse_markdown_to_blocks(summary))

            # Footer: divider followed by the generation timestamp in italics.
            blocks.append({"object": "block", "type": "divider", "divider": {}})
            blocks.append(
                {
                    "object": "block",
                    "type": "paragraph",
                    "paragraph": {
                        "rich_text": [
                            {
                                "type": "text",
                                "text": {
                                    "content": f"Generado por CBCFacil el {datetime.now().strftime('%d/%m/%Y %H:%M')}"
                                },
                                "annotations": {"italic": True, "color": "gray"},
                            }
                        ]
                    },
                }
            )

            # The Notion API caps blocks.children.append at 100 blocks/request.
            if blocks:
                for start in range(0, len(blocks), 100):
                    batch = blocks[start : start + 100]
                    self._rate_limited_request(
                        self._client.blocks.children.append,
                        block_id=page_id,
                        children=batch,
                    )

            self.logger.info(f"✅ Added {len(blocks)} blocks to Notion page")
            return True

        except Exception as e:
            self.logger.error(f"❌ Error adding content blocks: {e}")
            return False

    @staticmethod
    def _text_block(block_type: str, text: str) -> Dict:
        """Build a block of *block_type* holding a single plain rich-text run."""
        return {
            "object": "block",
            "type": block_type,
            block_type: {
                "rich_text": [{"type": "text", "text": {"content": text}}]
            },
        }

    def _parse_markdown_to_blocks(self, markdown: str) -> List[Dict]:
        """Convert a Markdown string into a list of Notion block dicts.

        Supports ``#``/``##``/``###`` headings, ``-``/``*`` bullets, ``---``
        dividers and plain paragraphs. Text is truncated to Notion's
        2000-character rich-text limit; blank lines and footer lines starting
        with "*Generado por" are skipped.
        """
        blocks: List[Dict] = []
        for line in markdown.split("\n"):
            line = line.strip()
            if not line:
                continue

            if line.startswith("# "):
                text = line[2:].strip()[:2000]
                if text:
                    blocks.append(self._text_block("heading_1", text))
            elif line.startswith("## "):
                text = line[3:].strip()[:2000]
                if text:
                    blocks.append(self._text_block("heading_2", text))
            elif line.startswith("### "):
                text = line[4:].strip()[:2000]
                if text:
                    blocks.append(self._text_block("heading_3", text))
            elif line.startswith("- ") or line.startswith("* "):
                text = line[2:].strip()[:2000]
                if text:
                    blocks.append(self._text_block("bulleted_list_item", text))
            elif line == "---":
                blocks.append({"object": "block", "type": "divider", "divider": {}})
            elif not line.startswith("*Generado por"):
                text = line[:2000]
                if text:
                    blocks.append(self._text_block("paragraph", text))

        return blocks

    def upload_pdf_legacy(self, pdf_path: Path, title: str) -> bool:
        """Legacy method — create a simple page for *pdf_path* (backward compatibility)."""
        if not self.is_configured:
            self.logger.warning("Notion not configured, skipping upload")
            return False

        try:
            page_id = self.create_page_with_summary(
                title=title,
                summary=f"Documento procesado: {title}",
                metadata={"file_type": "PDF", "pdf_path": pdf_path},
            )
            return bool(page_id)
        except Exception as e:
            self.logger.error(f"Error uploading PDF to Notion: {e}")
            return False

    def upload_pdf(self, pdf_path: Path, title: str) -> bool:
        """Upload PDF info to Notion (alias for backward compatibility)."""
        return self.upload_pdf_legacy(pdf_path, title)

    def upload_pdf_as_file(self, pdf_path: Path, title: str) -> bool:
        """Upload PDF info as file (alias for backward compatibility)."""
        return self.upload_pdf_legacy(pdf_path, title)
# Module-level singleton shared by the rest of the application.
notion_service = NotionService()


def upload_to_notion(pdf_path: Path, title: str) -> bool:
    """Legacy function for backward compatibility.

    Delegates to ``NotionService.upload_pdf`` on the shared instance.
    """
    return notion_service.upload_pdf(pdf_path, title)

View File

@@ -0,0 +1,203 @@
"""
Notion integration service
"""
import logging
import base64
from typing import Optional
from pathlib import Path
try:
import requests
REQUESTS_AVAILABLE = True
except ImportError:
REQUESTS_AVAILABLE = False
requests = None
from config import settings
class NotionService:
    """Service for Notion API integration using raw ``requests`` calls.

    Legacy REST implementation; the SDK-based service supersedes it, but the
    public interface (configure / is_configured / upload_pdf /
    upload_pdf_as_file) is preserved for existing callers.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Credentials are injected later via configure().
        self._token: Optional[str] = None
        self._database_id: Optional[str] = None
        self._base_url = "https://api.notion.com/v1"

    def configure(self, token: str, database_id: str) -> None:
        """Store the API token and target database id."""
        self._token = token
        self._database_id = database_id
        self.logger.info("Notion service configured")

    @property
    def is_configured(self) -> bool:
        """True once both token and database id have been set."""
        return bool(self._token and self._database_id)

    def _get_headers(self) -> dict:
        """Headers for Notion REST requests (API version pinned)."""
        return {
            "Authorization": f"Bearer {self._token}",
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28"
        }

    def _can_upload(self, pdf_path: Path) -> bool:
        """Shared upload preconditions; logs the specific reason on failure."""
        if not self.is_configured:
            self.logger.warning("Notion not configured, skipping upload")
            return False
        if not REQUESTS_AVAILABLE:
            self.logger.error("requests library not available for Notion upload")
            return False
        if not pdf_path.exists():
            self.logger.error(f"PDF file not found: {pdf_path}")
            return False
        return True

    def upload_pdf(self, pdf_path: Path, title: str) -> bool:
        """Create a database page referencing *pdf_path*.

        Fix: the previous implementation base64-encoded the entire PDF into a
        ``data:application/pdf;base64,...`` "external" file URL. Notion's file
        object only accepts regular HTTP(S) URLs (data URIs are rejected), and
        the base64 payload also exceeded request-size limits, so the call could
        never succeed. The page now records the document name and local path
        in paragraph blocks instead.

        Returns:
            True when the page was created (HTTP 200), False otherwise.
        """
        if not self._can_upload(pdf_path):
            return False

        try:
            # Page payload: Name/Status properties plus two text blocks.
            page_data = {
                "parent": {"database_id": self._database_id},
                "properties": {
                    "Name": {
                        "title": [{"text": {"content": title}}]
                    },
                    "Status": {
                        "select": {"name": "Procesado"}
                    }
                },
                "children": [
                    {
                        "object": "block",
                        "type": "paragraph",
                        "paragraph": {
                            "rich_text": [
                                {
                                    "type": "text",
                                    "text": {
                                        "content": f"Documento generado automáticamente: {title}"
                                    }
                                }
                            ]
                        }
                    },
                    {
                        "object": "block",
                        "type": "paragraph",
                        "paragraph": {
                            "rich_text": [
                                {
                                    "type": "text",
                                    "text": {"content": str(pdf_path)}
                                }
                            ]
                        }
                    }
                ]
            }

            response = requests.post(
                f"{self._base_url}/pages",
                headers=self._get_headers(),
                json=page_data,
                timeout=30
            )

            # Notion answers 200 with the page object on success.
            if response.status_code == 200:
                self.logger.info(f"PDF uploaded to Notion successfully: {title}")
                return True

            self.logger.error(
                f"Notion API error: {response.status_code} - {response.text}"
            )
            return False

        except Exception as e:
            self.logger.error(f"Error uploading PDF to Notion: {e}")
            return False

    def upload_pdf_as_file(self, pdf_path: Path, title: str) -> bool:
        """Create a page storing the PDF's local path in a "File Path" property.

        NOTE(review): assumes the target database defines a rich_text property
        named "File Path" — Notion rejects payloads with unknown properties;
        confirm the database schema before relying on this method.

        Returns:
            True when the page was created (HTTP 200), False otherwise.
        """
        if not self._can_upload(pdf_path):
            return False

        try:
            page_data = {
                "parent": {"database_id": self._database_id},
                "properties": {
                    "Name": {
                        "title": [{"text": {"content": title}}]
                    },
                    "Status": {
                        "select": {"name": "Procesado"}
                    },
                    "File Path": {
                        "rich_text": [{"text": {"content": str(pdf_path)}}]
                    }
                }
            }

            response = requests.post(
                f"{self._base_url}/pages",
                headers=self._get_headers(),
                json=page_data,
                timeout=30
            )

            if response.status_code == 200:
                self.logger.info(f"PDF uploaded to Notion successfully: {title}")
                return True

            self.logger.error(
                f"Notion API error: {response.status_code} - {response.text}"
            )
            return False

        except Exception as e:
            self.logger.error(f"Error uploading PDF to Notion: {e}")
            return False
# Module-level singleton shared by the rest of the application.
notion_service = NotionService()


def upload_to_notion(pdf_path: Path, title: str) -> bool:
    """Legacy function for backward compatibility.

    Delegates to ``NotionService.upload_pdf`` on the shared instance.
    """
    return notion_service.upload_pdf(pdf_path, title)

View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""
Script para verificar y configurar permisos de Notion
"""
import sys
import logging
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from config import settings
from notion_client import Client
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
def main():
    """Interactive diagnostic for Notion permissions.

    Prints step-by-step sharing instructions, then tries to retrieve the
    configured database via the official SDK and reports either its title and
    property schema, or a human-readable hint for the most common failure
    modes (database not shared with the integration, invalid token).
    """
    print("\n" + "=" * 60)
    print("🔧 VERIFICACIÓN DE PERMISOS DE NOTION")
    print("=" * 60 + "\n")

    # Credentials come from config.settings (loaded from .env).
    token = settings.NOTION_API_TOKEN
    database_id = settings.NOTION_DATABASE_ID

    if not token or not database_id:
        print("❌ Falta configuración de Notion en .env")
        print(f"   NOTION_API: {'' if token else ''}")
        print(f"   NOTION_DATABASE_ID: {'' if database_id else ''}")
        return

    # Only a prefix of the token is echoed, to avoid leaking the secret.
    print(f"✅ Token configurado: {token[:20]}...")
    print(f"✅ Database ID: {database_id}\n")

    # Build the official SDK client.
    client = Client(auth=token)

    print("📋 PASOS PARA CONFIGURAR LOS PERMISOS:\n")
    print("1. Abre Notion y ve a tu base de datos 'CBC'")
    print(f"   URL: https://www.notion.so/{database_id}")
    print("\n2. Click en los 3 puntos (⋯) en la esquina superior derecha")
    print("\n3. Selecciona 'Connections' o 'Añadir conexiones'")
    print("\n4. Busca tu integración y actívala")
    print(f"   (Debería aparecer con el nombre que le pusiste)")
    print("\n5. Confirma los permisos\n")
    print("-" * 60)
    print("\n🧪 Intentando conectar con Notion...\n")

    try:
        # Retrieving the database both authenticates the token and verifies
        # that the database is shared with this integration.
        database = client.databases.retrieve(database_id=database_id)

        print("✅ ¡ÉXITO! La integración puede acceder a la base de datos")
        print(f"\n📊 Información de la base de datos:")
        print(
            f"   Título: {database['title'][0]['plain_text'] if database.get('title') else 'Sin título'}"
        )
        print(f"   ID: {database['id']}")
        print(f"\n   Propiedades disponibles:")
        for prop_name, prop_data in database.get("properties", {}).items():
            prop_type = prop_data.get("type", "unknown")
            print(f"      - {prop_name}: {prop_type}")

        print("\n" + "=" * 60)
        print("✅ TODO CONFIGURADO CORRECTAMENTE")
        print("=" * 60 + "\n")
        print("🚀 Ahora ejecuta: python test_notion_integration.py")
        print("   para probar subir un documento\n")

    except Exception as e:
        error_msg = str(e)
        print("❌ ERROR AL CONECTAR CON NOTION\n")
        print(f"Error: {error_msg}\n")

        # Map the raw error message to an actionable hint.
        if "Could not find database" in error_msg:
            print("⚠️ LA BASE DE DATOS NO ESTÁ COMPARTIDA CON TU INTEGRACIÓN")
            print("\nSigue los pasos arriba para compartir la base de datos.")
        elif "Unauthorized" in error_msg or "401" in error_msg:
            print("⚠️ EL TOKEN DE API ES INVÁLIDO")
            print("\nVerifica que el token esté correcto en .env")
        else:
            print("⚠️ ERROR DESCONOCIDO")
            print(f"\nDetalles: {error_msg}")

        print("\n" + "=" * 60 + "\n")
if __name__ == "__main__":
main()