Cambios principales:

## Nuevos archivos
- services/ai/parallel_provider.py: Ejecución paralela de múltiples proveedores AI
- services/ai/prompt_manager.py: Gestión centralizada de prompts (resumen.md como fuente)
- latex/resumen.md: Template del prompt para resúmenes académicos LaTeX

## Mejoras en generación LaTeX (document/generators.py)
- Nueva función _sanitize_latex(): Corrige automáticamente errores comunes de AI
- Agrega align=center a nodos TikZ con saltos de línea (\\)
- Previene errores 'Not allowed in LR mode' antes de compilar
- Soporte para procesamiento paralelo de proveedores AI
- Conversión DOCX en paralelo con generación PDF
- Uploads a Notion en background (non-blocking)
- Callbacks de notificación para progreso en Telegram

## Mejoras en proveedores AI
- claude_provider.py: fix_latex() con instrucciones específicas para errores TikZ
- gemini_provider.py: fix_latex() mejorado + rate limiting + circuit breaker
- provider_factory.py: Soporte para parallel provider

## Otros cambios
- config/settings.py: Nuevas configuraciones para Gemini models
- services/webdav_service.py: Mejoras en manejo de conexión
- .gitignore: Ignora archivos LaTeX auxiliares (.aux, .toc, .out, .pdf)

## Archivos de ejemplo
- latex/imperio_romano.tex, latex/clase_revolucion_rusa_crisis_30.tex
- resumen_curiosidades.tex (corregido y compilado exitosamente)
603 lines
24 KiB
Python
603 lines
24 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
CBCFacil - Main Service Entry Point
|
|
Unified AI service for document processing (audio, PDF, text)
|
|
"""
|
|
|
|
import logging
|
|
import sys
|
|
import time
|
|
import fcntl
|
|
import os
|
|
import json
|
|
import threading
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
# Load environment variables from .env file.
from dotenv import load_dotenv

# Must run before any `from config import settings` below so that the
# settings module sees the .env values already present in os.environ.
load_dotenv()
|
|
|
|
|
|
# Configure logging with JSON formatter for production
|
|
class JSONFormatter(logging.Formatter):
    """JSON formatter for structured logging in production.

    Emits one JSON object per record with timestamp (UTC, trailing "Z"),
    level, message, module, function, and line number. When the record
    carries exception info, the formatted traceback is added under the
    "exception" key.
    """

    def format(self, record):
        """Return *record* serialized as a single-line JSON string."""
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # stripping tzinfo keeps the original "...Z" wire format (no
        # "+00:00" offset suffix), so downstream log parsers are unaffected.
        from datetime import timezone

        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        log_entry = {
            "timestamp": now_utc.isoformat() + "Z",
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
            "line": record.lineno,
        }

        # Add exception info if present.
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)

        return json.dumps(log_entry)
|
|
|
|
|
|
def setup_logging() -> logging.Logger:
    """Configure and return the module logger.

    Console output goes to stdout — JSON in production, human-readable
    otherwise. When settings.LOG_FILE is set, an additional JSON file
    handler is attached.
    """
    from config import settings

    log = logging.getLogger(__name__)
    log.setLevel(getattr(logging, settings.LOG_LEVEL.upper()))

    # Start from a clean slate so repeated calls don't stack handlers.
    log.handlers.clear()

    # Console handler: format depends on the deployment environment.
    stream = logging.StreamHandler(sys.stdout)
    if settings.is_production:
        fmt = JSONFormatter()
    else:
        fmt = logging.Formatter("%(asctime)s [%(levelname)s] - %(name)s - %(message)s")
    stream.setFormatter(fmt)
    log.addHandler(stream)

    # Optional persistent log file, always structured JSON.
    if settings.LOG_FILE:
        to_file = logging.FileHandler(settings.LOG_FILE)
        to_file.setFormatter(JSONFormatter())
        log.addHandler(to_file)

    return log
|
|
|
|
|
|
# Module-level logger, configured once at import time and shared by all
# functions below.
logger = setup_logging()
|
|
|
|
|
|
def acquire_lock():
    """Acquire the single-instance lock file.

    Returns:
        The open file object holding the exclusive flock. (The previous
        ``-> int`` annotation was wrong — a file object is returned.)
        Keep it alive for the process lifetime and hand it to
        release_lock() on shutdown.

    Raises:
        OSError (BlockingIOError): if another instance already holds the lock.
    """
    lock_file = (
        Path(os.getenv("LOCAL_STATE_DIR", str(Path(__file__).parent)))
        / ".main_service.lock"
    )
    lock_file.parent.mkdir(parents=True, exist_ok=True)
    lock_fd = open(lock_file, "w")
    try:
        fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        # Another instance holds the lock: don't leak the open handle.
        lock_fd.close()
        raise
    lock_fd.write(str(os.getpid()))
    lock_fd.flush()
    logger.info(f"Lock acquired. PID: {os.getpid()}")
    return lock_fd
|
|
|
|
|
|
def release_lock(lock_fd) -> None:
    """Best-effort release of the single-instance lock handle.

    Unlocks and closes *lock_fd*; any failure is logged as a warning
    rather than raised, since this runs on the shutdown path.
    """
    try:
        fd = lock_fd.fileno()
        fcntl.flock(fd, fcntl.LOCK_UN)
        lock_fd.close()
    except Exception as e:
        # Never let lock cleanup crash shutdown.
        logger.warning(f"Could not release lock: {e}")
|
|
|
|
|
|
def validate_configuration() -> None:
    """Validate environment configuration at startup.

    Improvement: each validation warning is now logged individually —
    previously only the warning *count* was logged, hiding the content.

    Raises:
        ConfigurationError: when the environment is invalid (re-raised
        after logging, so main() aborts startup).
    """
    from config.validators import validate_environment, ConfigurationError

    try:
        warnings = validate_environment()
        if warnings:
            logger.info(
                f"Configuration validation completed with {len(warnings)} warnings"
            )
            # Surface each individual warning instead of just the count.
            for warning in warnings:
                logger.warning(str(warning))
    except ConfigurationError as e:
        logger.error(f"Configuration validation failed: {e}")
        raise
|
|
|
|
|
|
def check_service_health() -> dict:
    """Check health of all external services.

    Returns:
        dict with "timestamp", overall "status" ("healthy"/"degraded"),
        and a per-service breakdown under "services". Only a WebDAV
        failure degrades the overall status; Telegram and VRAM problems
        are reported as "unavailable" but considered non-fatal.
    """
    from datetime import timezone

    from config import settings
    from services.webdav_service import webdav_service

    health_status = {
        # Timezone-aware replacement for deprecated datetime.utcnow();
        # tzinfo is stripped so the naive-ISO string format is unchanged.
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "status": "healthy",
        "services": {},
    }

    # WebDAV: a cheap list() on the root proves connectivity and auth.
    try:
        if settings.has_webdav_config:
            webdav_service.list(".")
            health_status["services"]["webdav"] = {"status": "healthy"}
        else:
            health_status["services"]["webdav"] = {"status": "not_configured"}
    except Exception as e:
        health_status["services"]["webdav"] = {"status": "unhealthy", "error": str(e)}
        health_status["status"] = "degraded"

    # Telegram: configuration flag only — no network round-trip here.
    try:
        from services.telegram_service import telegram_service

        if telegram_service.is_configured:
            health_status["services"]["telegram"] = {"status": "healthy"}
        else:
            health_status["services"]["telegram"] = {"status": "not_configured"}
    except Exception as e:
        health_status["services"]["telegram"] = {
            "status": "unavailable",
            "error": str(e),
        }

    # VRAM manager: report free GPU memory in GiB when available.
    try:
        from services.vram_manager import vram_manager

        vram_info = vram_manager.get_vram_info()
        health_status["services"]["vram"] = {
            "status": "healthy",
            "available_gb": vram_info.get("free", 0) / (1024**3),
        }
    except Exception as e:
        health_status["services"]["vram"] = {"status": "unavailable", "error": str(e)}

    return health_status
|
|
|
|
|
|
def initialize_services() -> None:
    """Initialize all services, validating configuration first.

    Order matters: configuration is validated (fatal on error), Telegram
    is configured early so later failures can be notified, then Notion,
    WebDAV, the VRAM manager, and the processed-file registry follow.
    Individual service failures are logged but do NOT abort startup.

    Raises:
        ConfigurationError: propagated from validate_configuration().
    """
    from config import settings
    from services.webdav_service import webdav_service
    from services.vram_manager import vram_manager
    from services.telegram_service import telegram_service
    from storage.processed_registry import processed_registry

    logger.info("Initializing services...")

    # Hard validation: raises on fatal misconfiguration (see above).
    validate_configuration()

    # Warn if WebDAV not configured — the polling loop will skip file sync.
    if not settings.has_webdav_config:
        logger.warning("WebDAV not configured - file sync functionality disabled")

    # Warn if AI providers not configured — summaries will fail downstream.
    if not settings.has_ai_config:
        logger.warning("AI providers not configured - summary generation will not work")

    # Configure Telegram if credentials available; failure is non-fatal,
    # the service simply runs without notifications.
    if settings.TELEGRAM_TOKEN and settings.TELEGRAM_CHAT_ID:
        try:
            telegram_service.configure(
                settings.TELEGRAM_TOKEN, settings.TELEGRAM_CHAT_ID
            )
            telegram_service.send_start_notification()
            logger.info("Telegram notifications enabled")
        except Exception as e:
            logger.error(f"Failed to configure Telegram: {e}")

    # Configure Notion if credentials available (import deferred so a
    # missing notion dependency only disables this feature).
    if settings.has_notion_config:
        try:
            from services.notion_service import notion_service

            notion_service.configure(
                settings.NOTION_API_TOKEN, settings.NOTION_DATABASE_ID
            )
            logger.info("✅ Notion integration enabled")
        except Exception as e:
            logger.error(f"Failed to configure Notion: {e}")
    else:
        logger.info("Notion not configured - upload to Notion disabled")

    # Initialize WebDAV if configured; errors are logged with traceback
    # but startup continues.
    if settings.has_webdav_config:
        try:
            webdav_service.initialize()
            logger.info("WebDAV service initialized")
        except Exception as e:
            logger.error(f"Failed to initialize WebDAV: {e}")
            logger.exception("WebDAV initialization error details")
    else:
        logger.info("Skipping WebDAV initialization (not configured)")

    # Initialize VRAM manager (non-fatal — presumably absent GPU just
    # makes this unavailable; see check_service_health).
    try:
        vram_manager.initialize()
        logger.info("VRAM manager initialized")
    except Exception as e:
        logger.error(f"Failed to initialize VRAM manager: {e}")
        logger.exception("VRAM manager initialization error details")

    # Initialize the registry that tracks already-processed remote files.
    try:
        processed_registry.initialize()
        logger.info("Processed registry initialized")
    except Exception as e:
        logger.error(f"Failed to initialize processed registry: {e}")
        logger.exception("Registry initialization error details")

    # Run an initial health check and log the full JSON snapshot.
    health = check_service_health()
    logger.info(f"Initial health check: {json.dumps(health, indent=2)}")

    logger.info("All services initialized successfully")
|
|
|
|
|
|
def send_error_notification(error_type: str, error_message: str) -> None:
    """Best-effort Telegram error notification; never raises.

    Silently does nothing when Telegram is not configured; any failure
    (including the import itself) is downgraded to a warning log.
    """
    try:
        from services.telegram_service import telegram_service

        if not telegram_service.is_configured:
            return
        telegram_service.send_error_notification(error_type, error_message)
    except Exception as e:
        logger.warning(f"Failed to send error notification: {e}")
|
|
|
|
|
|
def run_dashboard_thread() -> None:
    """Run the Flask dashboard (blocking); intended as a daemon-thread target.

    Bug fix: the port was hard-coded to 5000 while start_dashboard()
    logged the DASHBOARD_PORT env value, so the logged and actual port
    could disagree. The server now honors DASHBOARD_PORT (default 5000).
    """
    try:
        from api.routes import create_app

        app = create_app()

        # Same env var (and default) that start_dashboard() reports.
        port = int(os.getenv("DASHBOARD_PORT", "5000"))

        # Run Flask in production mode with threaded=True.
        app.run(
            host="0.0.0.0",
            port=port,
            debug=False,
            threaded=True,
            use_reloader=False,  # Important: disable reloader in thread
        )
    except Exception as e:
        logger.error(f"Dashboard thread error: {e}")
        logger.exception("Dashboard thread exception details")
|
|
|
|
|
|
def start_dashboard() -> threading.Thread:
    """Launch the Flask dashboard in a background daemon thread.

    Returns the started Thread so the caller can inspect it; being a
    daemon, it never blocks interpreter shutdown.
    """
    dashboard_port = int(os.getenv("DASHBOARD_PORT", "5000"))
    logger.info(f"Starting dashboard on port {dashboard_port}...")

    thread = threading.Thread(
        target=run_dashboard_thread,
        name="DashboardThread",
        daemon=True,  # don't block shutdown
    )
    thread.start()

    logger.info(f"Dashboard thread started (Thread-ID: {thread.ident})")
    return thread
|
|
|
|
|
|
def run_main_loop() -> None:
    """Main processing loop: poll WebDAV forever for PDFs, audio, and text.

    Each cycle: reload the processed-file registry, then scan three
    remote folders (PDF, audio, text). New files are downloaded,
    processed, and marked processed; audio additionally goes through
    AI summary generation and batch upload of the outputs. Per-category
    errors are caught so one failing category cannot stop the others;
    after max_consecutive_errors whole-cycle failures a critical alert
    is sent, but the loop never exits — it backs off and retries.
    """
    from config import settings
    from services.webdav_service import webdav_service
    from storage.processed_registry import processed_registry
    from processors.audio_processor import AudioProcessor
    from processors.pdf_processor import PDFProcessor
    from processors.text_processor import TextProcessor

    # Processors are created once and reused across polling cycles.
    audio_processor = AudioProcessor()
    pdf_processor = PDFProcessor()
    text_processor = TextProcessor()

    # Consecutive whole-cycle failures; reset to 0 after any clean cycle.
    consecutive_errors = 0
    max_consecutive_errors = 5

    while True:
        try:
            logger.info("--- Polling for new files ---")
            # Refresh registry in case another process updated it.
            processed_registry.load()

            # --- Process PDFs -------------------------------------------
            if settings.has_webdav_config:
                try:
                    # NOTE(review): this branch uses mkdir() while the
                    # upload paths below use makedirs() — confirm both
                    # exist on webdav_service.
                    webdav_service.mkdir(settings.REMOTE_PDF_FOLDER)
                    pdf_files = webdav_service.list(settings.REMOTE_PDF_FOLDER)
                    for file_path in pdf_files:
                        if file_path.lower().endswith(".pdf"):
                            if not processed_registry.is_processed(file_path):
                                # NOTE(review): Path/unquote are already
                                # imported at module level; these inline
                                # imports are redundant but harmless.
                                from pathlib import Path
                                from urllib.parse import unquote
                                from services.telegram_service import telegram_service

                                # Remote names are URL-encoded; decode for
                                # the local filename.
                                local_filename = unquote(Path(file_path).name)
                                # NOTE(review): base_name is unused in the
                                # PDF branch (only the audio branch uses it).
                                base_name = Path(local_filename).stem
                                local_path = (
                                    settings.LOCAL_DOWNLOADS_PATH / local_filename
                                )
                                settings.LOCAL_DOWNLOADS_PATH.mkdir(
                                    parents=True, exist_ok=True
                                )

                                # Step 1: Notify and download
                                telegram_service.send_message(
                                    f"📄 Nuevo PDF detectado: {local_filename}\n"
                                    f"⬇️ Descargando..."
                                )
                                logger.info(
                                    f"Downloading PDF: {file_path} -> {local_path}"
                                )
                                webdav_service.download(file_path, local_path)

                                # Step 2: Process PDF (OCR)
                                telegram_service.send_message(
                                    f"🔍 Procesando PDF con OCR..."
                                )
                                pdf_processor.process(str(local_path))

                                # Mark done only after successful processing.
                                processed_registry.save(file_path)
                except Exception as e:
                    logger.exception(f"Error processing PDFs: {e}")
                    send_error_notification("pdf_processing", str(e))

            # --- Process Audio files ------------------------------------
            if settings.has_webdav_config:
                try:
                    audio_files = webdav_service.list(settings.REMOTE_AUDIOS_FOLDER)
                    for file_path in audio_files:
                        if any(
                            file_path.lower().endswith(ext)
                            for ext in settings.AUDIO_EXTENSIONS
                        ):
                            if not processed_registry.is_processed(file_path):
                                from pathlib import Path
                                from urllib.parse import unquote
                                from document.generators import DocumentGenerator
                                from services.telegram_service import telegram_service

                                local_filename = unquote(Path(file_path).name)
                                # Stem is reused as the summary's base name.
                                base_name = Path(local_filename).stem
                                local_path = (
                                    settings.LOCAL_DOWNLOADS_PATH / local_filename
                                )
                                settings.LOCAL_DOWNLOADS_PATH.mkdir(
                                    parents=True, exist_ok=True
                                )

                                # Step 1: Notify and download
                                telegram_service.send_message(
                                    f"🎵 Nuevo audio detectado: {local_filename}\n"
                                    f"⬇️ Descargando..."
                                )
                                logger.info(
                                    f"Downloading audio: {file_path} -> {local_path}"
                                )
                                webdav_service.download(file_path, local_path)

                                # Step 2: Transcribe with Whisper
                                telegram_service.send_message(
                                    f"📝 Transcribiendo audio con Whisper..."
                                )
                                result = audio_processor.process(str(local_path))

                                # Only continue when transcription succeeded
                                # AND produced a file on disk.
                                if result.get("success") and result.get(
                                    "transcription_path"
                                ):
                                    transcription_file = Path(
                                        result["transcription_path"]
                                    )
                                    transcription_text = result.get("text", "")

                                    # Step 3: Generate AI summary and documents;
                                    # progress is relayed to Telegram via callback.
                                    telegram_service.send_message(
                                        f"🤖 Generando resumen académico LaTeX..."
                                    )
                                    doc_generator = DocumentGenerator(
                                        notification_callback=lambda msg: telegram_service.send_message(msg)
                                    )
                                    success, summary, output_files = (
                                        doc_generator.generate_summary(
                                            transcription_text, base_name
                                        )
                                    )

                                    # Step 4: Upload all files to Nextcloud
                                    if success and output_files:
                                        # Ensure destination folders exist
                                        # (best effort — may already exist).
                                        for folder in [
                                            settings.RESUMENES_FOLDER,
                                            settings.DOCX_FOLDER,
                                        ]:
                                            try:
                                                webdav_service.makedirs(folder)
                                            except Exception:
                                                pass

                                        # Collect (local, remote) pairs for a
                                        # single parallel batch upload.
                                        upload_tasks = []

                                        # Transcription TXT -> resumenes folder
                                        if transcription_file.exists():
                                            remote_txt = f"{settings.RESUMENES_FOLDER}/{transcription_file.name}"
                                            upload_tasks.append((transcription_file, remote_txt))

                                        # DOCX -> docx folder
                                        docx_path = Path(
                                            output_files.get("docx_path", "")
                                        )
                                        if docx_path.exists():
                                            remote_docx = f"{settings.DOCX_FOLDER}/{docx_path.name}"
                                            upload_tasks.append((docx_path, remote_docx))

                                        # PDF -> docx folder (same destination
                                        # as DOCX by design, apparently)
                                        pdf_path = Path(
                                            output_files.get("pdf_path", "")
                                        )
                                        if pdf_path.exists():
                                            remote_pdf = f"{settings.DOCX_FOLDER}/{pdf_path.name}"
                                            upload_tasks.append((pdf_path, remote_pdf))

                                        # Markdown -> resumenes folder
                                        md_path = Path(
                                            output_files.get("markdown_path", "")
                                        )
                                        if md_path.exists():
                                            remote_md = f"{settings.RESUMENES_FOLDER}/{md_path.name}"
                                            upload_tasks.append((md_path, remote_md))

                                        # Execute parallel uploads
                                        if upload_tasks:
                                            upload_results = webdav_service.upload_batch(
                                                upload_tasks, max_workers=4, timeout=120
                                            )
                                            logger.info(f"Parallel upload complete: {len(upload_results)} files")

                                        # Final success notification
                                        telegram_service.send_message(
                                            f"✅ Audio procesado: {local_filename}\n"
                                            f"📄 DOCX: {docx_path.name if docx_path.exists() else 'N/A'}\n"
                                            f"📑 PDF: {pdf_path.name if pdf_path.exists() else 'N/A'}\n"
                                            f"☁️ Subido a Nextcloud"
                                        )
                                    else:
                                        # Summary failed: still upload the raw
                                        # transcription so work isn't lost.
                                        if transcription_file.exists():
                                            try:
                                                webdav_service.makedirs(
                                                    settings.RESUMENES_FOLDER
                                                )
                                            except Exception:
                                                pass
                                            remote_txt = f"{settings.RESUMENES_FOLDER}/{transcription_file.name}"
                                            webdav_service.upload(
                                                transcription_file, remote_txt
                                            )
                                            telegram_service.send_message(
                                                f"⚠️ Resumen fallido, solo transcripción subida:\n{transcription_file.name}"
                                            )

                                # Marked processed even when the summary step
                                # failed — the transcription was handled above.
                                processed_registry.save(file_path)
                except Exception as e:
                    logger.exception(f"Error processing audio: {e}")
                    send_error_notification("audio_processing", str(e))

            # --- Process Text files -------------------------------------
            if settings.has_webdav_config:
                try:
                    text_files = webdav_service.list(settings.REMOTE_TXT_FOLDER)
                    for file_path in text_files:
                        if any(
                            file_path.lower().endswith(ext)
                            for ext in settings.TXT_EXTENSIONS
                        ):
                            if not processed_registry.is_processed(file_path):
                                # Text processor handles its own download.
                                text_processor.process(file_path)
                                processed_registry.save(file_path)
                except Exception as e:
                    logger.exception(f"Error processing text: {e}")
                    send_error_notification("text_processing", str(e))

            # Reset error counter after a fully clean cycle.
            consecutive_errors = 0

        except Exception as e:
            # Whole-cycle failure: log full traceback, notify, back off.
            logger.exception(f"Critical error in main loop: {e}")

            send_error_notification("main_loop", str(e))

            consecutive_errors += 1

            if consecutive_errors >= max_consecutive_errors:
                logger.critical(
                    f"Too many consecutive errors ({consecutive_errors}). "
                    "Service may be unstable. Consider checking configuration."
                )
                send_error_notification(
                    "consecutive_errors",
                    f"Service has failed {consecutive_errors} consecutive times",
                )

            # Don't exit — continue with doubled-interval backoff.
            logger.info(f"Waiting {settings.POLL_INTERVAL * 2} seconds before retry...")
            time.sleep(settings.POLL_INTERVAL * 2)
            continue

        logger.info(f"Cycle completed. Waiting {settings.POLL_INTERVAL} seconds...")
        time.sleep(settings.POLL_INTERVAL)
|
|
|
|
|
|
def main():
    """Service entry point.

    Acquires the single-instance lock, initializes services, starts the
    dashboard daemon thread, then blocks in the polling loop. The lock
    is always released on the way out; fatal errors notify Telegram and
    exit with status 1.
    """
    lock_fd = None
    dashboard_thread = None
    try:
        logger.info("=== CBCFacil Service Started ===")
        logger.info(f"Version: {os.getenv('APP_VERSION', '8.0')}")
        env_name = (
            "production"
            if os.getenv("DEBUG", "false").lower() != "true"
            else "development"
        )
        logger.info(f"Environment: {env_name}")

        lock_fd = acquire_lock()
        initialize_services()

        # Dashboard runs as a daemon thread alongside the polling loop.
        dashboard_thread = start_dashboard()

        # Blocks forever (or until KeyboardInterrupt / fatal error).
        run_main_loop()

    except KeyboardInterrupt:
        logger.info("Shutdown requested by user")
    except Exception as e:
        logger.exception(f"Fatal error in main: {e}")
        send_error_notification("fatal_error", str(e))
        sys.exit(1)
    finally:
        if lock_fd:
            release_lock(lock_fd)
        logger.info("=== CBCFacil Service Stopped ===")
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI dispatch: `whisper <audio> <arg>`, `pdf <file> <arg>`, `health`,
    # or no arguments to run the full service.
    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == "whisper" and len(sys.argv) == 4:
            from processors.audio_processor import AudioProcessor

            # NOTE(review): sys.argv[3] is required by the arity check but
            # never used — confirm whether it was meant as an output path.
            AudioProcessor().process(sys.argv[2])
        elif command == "pdf" and len(sys.argv) == 4:
            from processors.pdf_processor import PDFProcessor

            # NOTE(review): same unused sys.argv[3] as above.
            PDFProcessor().process(sys.argv[2])
        elif command == "health":
            # Fix: call the local function directly. The previous
            # `from main import check_service_health` re-imported this very
            # module, re-executing its top-level code (logging setup, etc.).
            health = check_service_health()
            print(json.dumps(health, indent=2))
        else:
            print("Usage: python main.py [whisper|pdf|health]")
            sys.exit(1)
    else:
        main()
|