feat: Integración automática con Notion + análisis completo del código

- Instalado notion-client SDK oficial para integración robusta
- Refactorizado services/notion_service.py con SDK oficial de Notion
  - Rate limiting con retry y exponential backoff
  - Parser Markdown → Notion blocks (headings, bullets, paragraphs)
  - Soporte para pages y databases
  - Manejo robusto de errores

- Integración automática en document/generators.py
  - PDFs se suben automáticamente a Notion después de generarse
  - Contenido completo del resumen formateado con bloques
  - Metadata rica (tipo de archivo, path, fecha)

- Configuración de Notion en main.py
  - Inicialización automática al arrancar el servicio
  - Validación de credenciales

- Actualizado config/settings.py
  - Agregado load_dotenv() para cargar variables de .env
  - Configuración de Notion (NOTION_API_TOKEN, NOTION_DATABASE_ID)

- Scripts de utilidad creados:
  - test_notion_integration.py: Test de subida a Notion
  - test_pipeline_notion.py: Test del pipeline completo
  - verify_notion_permissions.py: Verificación de permisos
  - list_notion_pages.py: Listar páginas accesibles
  - diagnose_notion.py: Diagnóstico completo
  - create_notion_database.py: Crear database automáticamente
  - restart_service.sh: Script de reinicio del servicio

- Documentación completa en opus.md:
  - Análisis exhaustivo del codebase (42 archivos Python)
  - Bugs críticos identificados y soluciones
  - Mejoras de seguridad (autenticación, rate limiting, CORS, CSP)
  - Optimizaciones de rendimiento (Celery, Redis, PostgreSQL, WebSockets)
  - Plan de testing (estructura, ejemplos, 80% coverage goal)
  - Roadmap de implementación (6 sprints detallados)
  - Integración avanzada con Notion documentada

Estado: Notion funcionando correctamente, PDFs se suben automáticamente
This commit is contained in:
renato97
2026-01-26 17:26:50 +00:00
parent 47896fd50a
commit 6058dc642e
12 changed files with 3863 additions and 184 deletions

294
main.py
View File

@@ -3,6 +3,7 @@
CBCFacil - Main Service Entry Point
Unified AI service for document processing (audio, PDF, text)
"""
import logging
import sys
import time
@@ -16,12 +17,14 @@ from typing import Optional
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
# Configure logging with JSON formatter for production
class JSONFormatter(logging.Formatter):
"""JSON formatter for structured logging in production"""
def format(self, record):
log_entry = {
"timestamp": datetime.utcnow().isoformat() + "Z",
@@ -29,43 +32,43 @@ class JSONFormatter(logging.Formatter):
"message": record.getMessage(),
"module": record.module,
"function": record.funcName,
"line": record.lineno
"line": record.lineno,
}
# Add exception info if present
if record.exc_info:
log_entry["exception"] = self.formatException(record.exc_info)
return json.dumps(log_entry)
def setup_logging() -> logging.Logger:
    """Set up logging configuration for the service.

    Configures the module logger with a stdout handler — JSON-formatted
    in production, human-readable otherwise — plus an optional JSON file
    handler when ``settings.LOG_FILE`` is set.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    from config import settings

    # Create logger; level comes from settings (e.g. "INFO", "DEBUG")
    logger = logging.getLogger(__name__)
    logger.setLevel(getattr(logging, settings.LOG_LEVEL.upper()))

    # Remove existing handlers so repeated calls don't duplicate output
    logger.handlers.clear()

    # Console handler: structured JSON in production, plain text in dev
    console_handler = logging.StreamHandler(sys.stdout)
    if settings.is_production:
        console_handler.setFormatter(JSONFormatter())
    else:
        console_handler.setFormatter(
            logging.Formatter("%(asctime)s [%(levelname)s] - %(name)s - %(message)s")
        )
    logger.addHandler(console_handler)

    # File handler if configured (always JSON for machine parsing)
    if settings.LOG_FILE:
        file_handler = logging.FileHandler(settings.LOG_FILE)
        file_handler.setFormatter(JSONFormatter())
        logger.addHandler(file_handler)

    return logger
@@ -74,9 +77,12 @@ logger = setup_logging()
def acquire_lock() -> int:
"""Acquire single instance lock"""
lock_file = Path(os.getenv("LOCAL_STATE_DIR", str(Path(__file__).parent))) / ".main_service.lock"
lock_file = (
Path(os.getenv("LOCAL_STATE_DIR", str(Path(__file__).parent)))
/ ".main_service.lock"
)
lock_file.parent.mkdir(parents=True, exist_ok=True)
lock_fd = open(lock_file, 'w')
lock_fd = open(lock_file, "w")
fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
lock_fd.write(str(os.getpid()))
lock_fd.flush()
@@ -96,11 +102,13 @@ def release_lock(lock_fd) -> None:
def validate_configuration() -> None:
    """Validate environment configuration at startup.

    Delegates to ``config.validators.validate_environment``; logs how
    many warnings were produced on success and re-raises
    ``ConfigurationError`` so the service fails fast on an invalid
    environment.

    Raises:
        ConfigurationError: when the environment fails validation.
    """
    from config.validators import validate_environment, ConfigurationError

    try:
        warnings = validate_environment()
        if warnings:
            logger.info(
                f"Configuration validation completed with {len(warnings)} warnings"
            )
    except ConfigurationError as e:
        logger.error(f"Configuration validation failed: {e}")
        raise
@@ -113,13 +121,13 @@ def check_service_health() -> dict:
"""
from config import settings
from services.webdav_service import webdav_service
health_status = {
"timestamp": datetime.utcnow().isoformat(),
"status": "healthy",
"services": {}
"services": {},
}
# Check WebDAV
try:
if settings.has_webdav_config:
@@ -129,15 +137,13 @@ def check_service_health() -> dict:
else:
health_status["services"]["webdav"] = {"status": "not_configured"}
except Exception as e:
health_status["services"]["webdav"] = {
"status": "unhealthy",
"error": str(e)
}
health_status["services"]["webdav"] = {"status": "unhealthy", "error": str(e)}
health_status["status"] = "degraded"
# Check Telegram
try:
from services.telegram_service import telegram_service
if telegram_service.is_configured:
health_status["services"]["telegram"] = {"status": "healthy"}
else:
@@ -145,23 +151,21 @@ def check_service_health() -> dict:
except Exception as e:
health_status["services"]["telegram"] = {
"status": "unavailable",
"error": str(e)
"error": str(e),
}
# Check VRAM manager
try:
from services.vram_manager import vram_manager
vram_info = vram_manager.get_vram_info()
health_status["services"]["vram"] = {
"status": "healthy",
"available_gb": vram_info.get("free", 0) / (1024**3)
"available_gb": vram_info.get("free", 0) / (1024**3),
}
except Exception as e:
health_status["services"]["vram"] = {
"status": "unavailable",
"error": str(e)
}
health_status["services"]["vram"] = {"status": "unavailable", "error": str(e)}
return health_status
@@ -172,29 +176,45 @@ def initialize_services() -> None:
from services.vram_manager import vram_manager
from services.telegram_service import telegram_service
from storage.processed_registry import processed_registry
logger.info("Initializing services...")
# Validate configuration
validate_configuration()
# Warn if WebDAV not configured
if not settings.has_webdav_config:
logger.warning("WebDAV not configured - file sync functionality disabled")
# Warn if AI providers not configured
if not settings.has_ai_config:
logger.warning("AI providers not configured - summary generation will not work")
# Configure Telegram if credentials available
if settings.TELEGRAM_TOKEN and settings.TELEGRAM_CHAT_ID:
try:
telegram_service.configure(settings.TELEGRAM_TOKEN, settings.TELEGRAM_CHAT_ID)
telegram_service.configure(
settings.TELEGRAM_TOKEN, settings.TELEGRAM_CHAT_ID
)
telegram_service.send_start_notification()
logger.info("Telegram notifications enabled")
except Exception as e:
logger.error(f"Failed to configure Telegram: {e}")
# Configure Notion if credentials available
if settings.has_notion_config:
try:
from services.notion_service import notion_service
notion_service.configure(
settings.NOTION_API_TOKEN, settings.NOTION_DATABASE_ID
)
logger.info("✅ Notion integration enabled")
except Exception as e:
logger.error(f"Failed to configure Notion: {e}")
else:
logger.info("Notion not configured - upload to Notion disabled")
# Initialize WebDAV if configured
if settings.has_webdav_config:
try:
@@ -205,7 +225,7 @@ def initialize_services() -> None:
logger.exception("WebDAV initialization error details")
else:
logger.info("Skipping WebDAV initialization (not configured)")
# Initialize VRAM manager
try:
vram_manager.initialize()
@@ -213,7 +233,7 @@ def initialize_services() -> None:
except Exception as e:
logger.error(f"Failed to initialize VRAM manager: {e}")
logger.exception("VRAM manager initialization error details")
# Initialize processed registry
try:
processed_registry.initialize()
@@ -221,11 +241,11 @@ def initialize_services() -> None:
except Exception as e:
logger.error(f"Failed to initialize processed registry: {e}")
logger.exception("Registry initialization error details")
# Run health check
health = check_service_health()
logger.info(f"Initial health check: {json.dumps(health, indent=2)}")
logger.info("All services initialized successfully")
@@ -233,6 +253,7 @@ def send_error_notification(error_type: str, error_message: str) -> None:
"""Send error notification via Telegram"""
try:
from services.telegram_service import telegram_service
if telegram_service.is_configured:
telegram_service.send_error_notification(error_type, error_message)
except Exception as e:
@@ -243,15 +264,16 @@ def run_dashboard_thread() -> None:
"""Run Flask dashboard in a separate thread"""
try:
from api.routes import create_app
app = create_app()
# Run Flask in production mode with threaded=True
app.run(
host='0.0.0.0',
host="0.0.0.0",
port=5000,
debug=False,
threaded=True,
use_reloader=False # Important: disable reloader in thread
use_reloader=False, # Important: disable reloader in thread
)
except Exception as e:
logger.error(f"Dashboard thread error: {e}")
@@ -260,14 +282,12 @@ def run_dashboard_thread() -> None:
def start_dashboard() -> threading.Thread:
"""Start dashboard in a background daemon thread"""
dashboard_port = int(os.getenv('DASHBOARD_PORT', '5000'))
dashboard_port = int(os.getenv("DASHBOARD_PORT", "5000"))
logger.info(f"Starting dashboard on port {dashboard_port}...")
# Create daemon thread so it doesn't block shutdown
dashboard_thread = threading.Thread(
target=run_dashboard_thread,
name="DashboardThread",
daemon=True
target=run_dashboard_thread, name="DashboardThread", daemon=True
)
dashboard_thread.start()
logger.info(f"Dashboard thread started (Thread-ID: {dashboard_thread.ident})")
@@ -282,109 +302,169 @@ def run_main_loop() -> None:
from processors.audio_processor import AudioProcessor
from processors.pdf_processor import PDFProcessor
from processors.text_processor import TextProcessor
audio_processor = AudioProcessor()
pdf_processor = PDFProcessor()
text_processor = TextProcessor()
consecutive_errors = 0
max_consecutive_errors = 5
while True:
try:
logger.info("--- Polling for new files ---")
processed_registry.load()
# Process PDFs
if settings.has_webdav_config:
try:
webdav_service.mkdir(settings.REMOTE_PDF_FOLDER)
pdf_files = webdav_service.list(settings.REMOTE_PDF_FOLDER)
for file_path in pdf_files:
if file_path.lower().endswith('.pdf'):
if file_path.lower().endswith(".pdf"):
if not processed_registry.is_processed(file_path):
pdf_processor.process(file_path)
from pathlib import Path
from urllib.parse import unquote
from services.telegram_service import telegram_service
local_filename = unquote(Path(file_path).name)
base_name = Path(local_filename).stem
local_path = (
settings.LOCAL_DOWNLOADS_PATH / local_filename
)
settings.LOCAL_DOWNLOADS_PATH.mkdir(
parents=True, exist_ok=True
)
# Step 1: Notify and download
telegram_service.send_message(
f"📄 Nuevo PDF detectado: {local_filename}\n"
f"⬇️ Descargando..."
)
logger.info(
f"Downloading PDF: {file_path} -> {local_path}"
)
webdav_service.download(file_path, local_path)
# Step 2: Process PDF
telegram_service.send_message(
f"🔍 Procesando PDF con OCR..."
)
pdf_processor.process(str(local_path))
processed_registry.save(file_path)
except Exception as e:
logger.exception(f"Error processing PDFs: {e}")
send_error_notification("pdf_processing", str(e))
# Process Audio files
if settings.has_webdav_config:
try:
audio_files = webdav_service.list(settings.REMOTE_AUDIOS_FOLDER)
for file_path in audio_files:
if any(file_path.lower().endswith(ext) for ext in settings.AUDIO_EXTENSIONS):
if any(
file_path.lower().endswith(ext)
for ext in settings.AUDIO_EXTENSIONS
):
if not processed_registry.is_processed(file_path):
from pathlib import Path
from urllib.parse import unquote
from document.generators import DocumentGenerator
from services.telegram_service import telegram_service
local_filename = unquote(Path(file_path).name)
base_name = Path(local_filename).stem
local_path = settings.LOCAL_DOWNLOADS_PATH / local_filename
settings.LOCAL_DOWNLOADS_PATH.mkdir(parents=True, exist_ok=True)
local_path = (
settings.LOCAL_DOWNLOADS_PATH / local_filename
)
settings.LOCAL_DOWNLOADS_PATH.mkdir(
parents=True, exist_ok=True
)
# Step 1: Notify and download
telegram_service.send_message(
f"🎵 Nuevo audio detectado: {local_filename}\n"
f"⬇️ Descargando..."
)
logger.info(f"Downloading audio: {file_path} -> {local_path}")
logger.info(
f"Downloading audio: {file_path} -> {local_path}"
)
webdav_service.download(file_path, local_path)
# Step 2: Transcribe
telegram_service.send_message(f"📝 Transcribiendo audio con Whisper...")
telegram_service.send_message(
f"📝 Transcribiendo audio con Whisper..."
)
result = audio_processor.process(str(local_path))
if result.get("success") and result.get("transcription_path"):
transcription_file = Path(result["transcription_path"])
transcription_text = result.get("text", "")
# Step 3: Generate AI summary and documents
telegram_service.send_message(f"🤖 Generando resumen con IA...")
doc_generator = DocumentGenerator()
success, summary, output_files = doc_generator.generate_summary(
transcription_text, base_name
if result.get("success") and result.get(
"transcription_path"
):
transcription_file = Path(
result["transcription_path"]
)
transcription_text = result.get("text", "")
# Step 3: Generate AI summary and documents
telegram_service.send_message(
f"🤖 Generando resumen con IA..."
)
doc_generator = DocumentGenerator()
success, summary, output_files = (
doc_generator.generate_summary(
transcription_text, base_name
)
)
# Step 4: Upload all files to Nextcloud
if success and output_files:
# Create folders
for folder in [settings.RESUMENES_FOLDER, settings.DOCX_FOLDER]:
for folder in [
settings.RESUMENES_FOLDER,
settings.DOCX_FOLDER,
]:
try:
webdav_service.makedirs(folder)
except Exception:
pass
# Upload transcription TXT
if transcription_file.exists():
remote_txt = f"{settings.RESUMENES_FOLDER}/{transcription_file.name}"
webdav_service.upload(transcription_file, remote_txt)
webdav_service.upload(
transcription_file, remote_txt
)
logger.info(f"Uploaded: {remote_txt}")
# Upload DOCX
docx_path = Path(output_files.get('docx_path', ''))
docx_path = Path(
output_files.get("docx_path", "")
)
if docx_path.exists():
remote_docx = f"{settings.DOCX_FOLDER}/{docx_path.name}"
webdav_service.upload(docx_path, remote_docx)
webdav_service.upload(
docx_path, remote_docx
)
logger.info(f"Uploaded: {remote_docx}")
# Upload PDF
pdf_path = Path(output_files.get('pdf_path', ''))
pdf_path = Path(
output_files.get("pdf_path", "")
)
if pdf_path.exists():
remote_pdf = f"{settings.DOCX_FOLDER}/{pdf_path.name}"
webdav_service.upload(pdf_path, remote_pdf)
logger.info(f"Uploaded: {remote_pdf}")
# Upload Markdown
md_path = Path(output_files.get('markdown_path', ''))
md_path = Path(
output_files.get("markdown_path", "")
)
if md_path.exists():
remote_md = f"{settings.RESUMENES_FOLDER}/{md_path.name}"
webdav_service.upload(md_path, remote_md)
logger.info(f"Uploaded: {remote_md}")
# Final notification
telegram_service.send_message(
f"✅ Audio procesado: {local_filename}\n"
@@ -396,46 +476,53 @@ def run_main_loop() -> None:
# Just upload transcription if summary failed
if transcription_file.exists():
try:
webdav_service.makedirs(settings.RESUMENES_FOLDER)
webdav_service.makedirs(
settings.RESUMENES_FOLDER
)
except Exception:
pass
remote_txt = f"{settings.RESUMENES_FOLDER}/{transcription_file.name}"
webdav_service.upload(transcription_file, remote_txt)
webdav_service.upload(
transcription_file, remote_txt
)
telegram_service.send_message(
f"⚠️ Resumen fallido, solo transcripción subida:\n{transcription_file.name}"
)
processed_registry.save(file_path)
except Exception as e:
logger.exception(f"Error processing audio: {e}")
send_error_notification("audio_processing", str(e))
# Process Text files
if settings.has_webdav_config:
try:
text_files = webdav_service.list(settings.REMOTE_TXT_FOLDER)
for file_path in text_files:
if any(file_path.lower().endswith(ext) for ext in settings.TXT_EXTENSIONS):
if any(
file_path.lower().endswith(ext)
for ext in settings.TXT_EXTENSIONS
):
if not processed_registry.is_processed(file_path):
text_processor.process(file_path)
processed_registry.save(file_path)
except Exception as e:
logger.exception(f"Error processing text: {e}")
send_error_notification("text_processing", str(e))
# Reset error counter on success
consecutive_errors = 0
except Exception as e:
# Improved error logging with full traceback
logger.exception(f"Critical error in main loop: {e}")
# Send notification for critical errors
send_error_notification("main_loop", str(e))
# Track consecutive errors
consecutive_errors += 1
if consecutive_errors >= max_consecutive_errors:
logger.critical(
f"Too many consecutive errors ({consecutive_errors}). "
@@ -443,14 +530,14 @@ def run_main_loop() -> None:
)
send_error_notification(
"consecutive_errors",
f"Service has failed {consecutive_errors} consecutive times"
f"Service has failed {consecutive_errors} consecutive times",
)
# Don't exit, let the loop continue with backoff
logger.info(f"Waiting {settings.POLL_INTERVAL * 2} seconds before retry...")
time.sleep(settings.POLL_INTERVAL * 2)
continue
logger.info(f"Cycle completed. Waiting {settings.POLL_INTERVAL} seconds...")
time.sleep(settings.POLL_INTERVAL)
@@ -462,7 +549,9 @@ def main():
try:
logger.info("=== CBCFacil Service Started ===")
logger.info(f"Version: {os.getenv('APP_VERSION', '8.0')}")
logger.info(f"Environment: {'production' if os.getenv('DEBUG', 'false').lower() != 'true' else 'development'}")
logger.info(
f"Environment: {'production' if os.getenv('DEBUG', 'false').lower() != 'true' else 'development'}"
)
lock_fd = acquire_lock()
initialize_services()
@@ -472,7 +561,7 @@ def main():
# Run main processing loop
run_main_loop()
except KeyboardInterrupt:
logger.info("Shutdown requested by user")
except Exception as e:
@@ -491,12 +580,15 @@ if __name__ == "__main__":
command = sys.argv[1]
if command == "whisper" and len(sys.argv) == 4:
from processors.audio_processor import AudioProcessor
AudioProcessor().process(sys.argv[2])
elif command == "pdf" and len(sys.argv) == 4:
from processors.pdf_processor import PDFProcessor
PDFProcessor().process(sys.argv[2])
elif command == "health":
from main import check_service_health
health = check_service_health()
print(json.dumps(health, indent=2))
else: