feat: Sistema CBCFacil completo con cola secuencial

- Implementa ProcessingMonitor singleton para procesamiento secuencial de archivos
- Agrega AI summary service con soporte para MiniMax API
- Agrega PDF generator para resúmenes
- Agrega watchers para monitoreo de carpeta remota
- Mejora sistema de notificaciones Telegram
- Implementa gestión de VRAM para GPU
- Configuración mediante variables de entorno (sin hardcoded secrets)
- .env y transcriptions/ agregados a .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
renato97
2026-02-25 15:35:39 +00:00
parent dcf887c510
commit ee8fc183be
77 changed files with 3734 additions and 20263 deletions

View File

@@ -1,261 +1,78 @@
"""
Centralized configuration management for CBCFacil
Configuración centralizada de la aplicación.
Carga variables de entorno desde archivo .env
"""
import os
from pathlib import Path
from typing import Optional, Set, Union
from typing import Optional
from dotenv import load_dotenv
# Load environment variables from .env file
# Cargar variables de entorno
load_dotenv()
# Exception type used by the validated accessors nextcloud_url,
# nextcloud_user and nextcloud_password, which raise it when the
# corresponding setting is empty while is_production is truthy.
class ConfigurationError(Exception):
"""Raised when configuration is invalid"""
pass
class Settings:
"""Application settings loaded from environment variables"""
"""Configuración centralizada de la aplicación."""
# Application
APP_NAME: str = "CBCFacil"
APP_VERSION: str = "8.0"
DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"
# Nextcloud/WebDAV Configuration
# Nextcloud/WebDAV
NEXTCLOUD_URL: str = os.getenv("NEXTCLOUD_URL", "")
NEXTCLOUD_USER: str = os.getenv("NEXTCLOUD_USER", "")
NEXTCLOUD_PASSWORD: str = os.getenv("NEXTCLOUD_PASSWORD", "")
WEBDAV_ENDPOINT: str = NEXTCLOUD_URL
# Remote folders
REMOTE_AUDIOS_FOLDER: str = "Audios"
REMOTE_DOCX_AUDIO_FOLDER: str = "Documentos"
REMOTE_PDF_FOLDER: str = "Pdf"
REMOTE_TXT_FOLDER: str = "Textos"
RESUMENES_FOLDER: str = "Resumenes"
DOCX_FOLDER: str = "Documentos"
# Local paths
BASE_DIR: Path = Path(__file__).resolve().parent.parent
LOCAL_STATE_DIR: str = os.getenv("LOCAL_STATE_DIR", str(BASE_DIR))
LOCAL_DOWNLOADS_PATH: Path = BASE_DIR / "downloads"
LOCAL_RESUMENES: Path = LOCAL_DOWNLOADS_PATH
LOCAL_DOCX: Path = BASE_DIR / "resumenes_docx"
# Processing
POLL_INTERVAL: int = int(os.getenv("POLL_INTERVAL", "5"))
HTTP_TIMEOUT: int = int(os.getenv("HTTP_TIMEOUT", "30"))
WEBDAV_MAX_RETRIES: int = int(os.getenv("WEBDAV_MAX_RETRIES", "3"))
DOWNLOAD_CHUNK_SIZE: int = int(
os.getenv("DOWNLOAD_CHUNK_SIZE", "65536")
) # 64KB for better performance
MAX_FILENAME_LENGTH: int = int(os.getenv("MAX_FILENAME_LENGTH", "80"))
MAX_FILENAME_BASE_LENGTH: int = int(os.getenv("MAX_FILENAME_BASE_LENGTH", "40"))
MAX_FILENAME_TOPICS_LENGTH: int = int(os.getenv("MAX_FILENAME_TOPICS_LENGTH", "20"))
# File extensions
AUDIO_EXTENSIONS: Set[str] = {".mp3", ".wav", ".m4a", ".ogg", ".aac"}
PDF_EXTENSIONS: Set[str] = {".pdf"}
TXT_EXTENSIONS: Set[str] = {".txt"}
# AI Providers
ZAI_BASE_URL: str = os.getenv("ZAI_BASE_URL", "https://api.z.ai/api/anthropic")
ZAI_DEFAULT_MODEL: str = os.getenv("ZAI_MODEL", "glm-4.6")
ZAI_AUTH_TOKEN: Optional[str] = os.getenv("ANTHROPIC_AUTH_TOKEN") or os.getenv(
"ZAI_AUTH_TOKEN", ""
)
# Notion Integration
NOTION_API_TOKEN: Optional[str] = os.getenv("NOTION_API")
NOTION_DATABASE_ID: Optional[str] = os.getenv("NOTION_DATABASE_ID")
# Gemini
GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
GEMINI_FLASH_MODEL: str = os.getenv("GEMINI_FLASH_MODEL", "gemini-2.5-flash")
GEMINI_PRO_MODEL: str = os.getenv("GEMINI_PRO_MODEL", "gemini-1.5-pro")
# CLI paths
GEMINI_CLI_PATH: Optional[str] = os.getenv("GEMINI_CLI_PATH")
CLAUDE_CLI_PATH: Optional[str] = os.getenv("CLAUDE_CLI_PATH")
# Telegram
# Telegram (opcional)
TELEGRAM_TOKEN: Optional[str] = os.getenv("TELEGRAM_TOKEN")
TELEGRAM_CHAT_ID: Optional[str] = os.getenv("TELEGRAM_CHAT_ID")
# PDF Processing Configuration
CPU_COUNT: int = os.cpu_count() or 1
PDF_MAX_PAGES_PER_CHUNK: int = int(os.getenv("PDF_MAX_PAGES_PER_CHUNK", "2"))
PDF_DPI: int = int(os.getenv("PDF_DPI", "200"))
PDF_RENDER_THREAD_COUNT: int = int(
os.getenv("PDF_RENDER_THREAD_COUNT", str(min(4, CPU_COUNT)))
)
PDF_BATCH_SIZE: int = int(os.getenv("PDF_BATCH_SIZE", "2"))
PDF_TROCR_MAX_BATCH: int = int(
os.getenv("PDF_TROCR_MAX_BATCH", str(PDF_BATCH_SIZE))
)
PDF_TESSERACT_THREADS: int = int(
os.getenv("PDF_TESSERACT_THREADS", str(max(1, min(2, max(1, CPU_COUNT // 3)))))
)
PDF_PREPROCESS_THREADS: int = int(
os.getenv("PDF_PREPROCESS_THREADS", str(PDF_TESSERACT_THREADS))
)
PDF_TEXT_DETECTION_MIN_RATIO: float = float(
os.getenv("PDF_TEXT_DETECTION_MIN_RATIO", "0.6")
)
PDF_TEXT_DETECTION_MIN_AVG_CHARS: int = int(
os.getenv("PDF_TEXT_DETECTION_MIN_AVG_CHARS", "120")
)
# AI Providers (opcional)
GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
DEEPINFRA_API_KEY: Optional[str] = os.getenv("DEEPINFRA_API_KEY")
ANTHROPIC_AUTH_TOKEN: Optional[str] = os.getenv("ANTHROPIC_AUTH_TOKEN")
ANTHROPIC_BASE_URL: Optional[str] = os.getenv("ANTHROPIC_BASE_URL")
ANTHROPIC_MODEL: str = os.getenv("ANTHROPIC_MODEL", "glm-4.7")
# Error handling
ERROR_THROTTLE_SECONDS: int = int(os.getenv("ERROR_THROTTLE_SECONDS", "600"))
# GPU/VRAM Management
MODEL_TIMEOUT_SECONDS: int = int(os.getenv("MODEL_TIMEOUT_SECONDS", "300"))
CUDA_VISIBLE_DEVICES: str = os.getenv("CUDA_VISIBLE_DEVICES", "all")
PYTORCH_CUDA_ALLOC_CONF: str = os.getenv(
"PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:512"
)
# GPU Detection (auto, nvidia, amd, cpu)
GPU_PREFERENCE: str = os.getenv("GPU_PREFERENCE", "auto")
# AMD ROCm HSA override for RX 6000 series (gfx1030)
HSA_OVERRIDE_GFX_VERSION: str = os.getenv("HSA_OVERRIDE_GFX_VERSION", "10.3.0")
# Notion (opcional)
NOTION_API: Optional[str] = os.getenv("NOTION_API")
NOTION_DATABASE_ID: Optional[str] = os.getenv("NOTION_DATABASE_ID")
# Dashboard
DASHBOARD_SECRET_KEY: str = os.getenv("DASHBOARD_SECRET_KEY", "")
DASHBOARD_PORT: int = int(os.getenv("DASHBOARD_PORT", "5000"))
DASHBOARD_HOST: str = os.getenv("DASHBOARD_HOST", "0.0.0.0")
DASHBOARD_PORT: int = int(os.getenv("DASHBOARD_PORT", "5000"))
DASHBOARD_SECRET_KEY: Optional[str] = os.getenv("DASHBOARD_SECRET_KEY")
# Rutas locales
BASE_DIR: Path = Path(__file__).parent.parent
DOWNLOADS_DIR: Path = BASE_DIR / "downloads"
TRANSCRIPTIONS_DIR: Path = BASE_DIR / "transcriptions"
# Whisper
WHISPER_MODEL: str = os.getenv("WHISPER_MODEL", "medium")
WHISPER_DEVICE: str = os.getenv("WHISPER_DEVICE", "auto") # auto, cuda, cpu
WHISPER_LANGUAGE: str = os.getenv("WHISPER_LANGUAGE", "es")
WHISPER_AUTO_UNLOAD_SECONDS: int = int(os.getenv("WHISPER_AUTO_UNLOAD_SECONDS", "300")) # 5 minutos
# Configuración de polling
POLL_INTERVAL: int = int(os.getenv("POLL_INTERVAL", "30")) # segundos
WATCHED_REMOTE_PATH: str = os.getenv("WATCHED_REMOTE_PATH", "/Audios")
# Logging
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
LOG_FILE: Optional[str] = os.getenv("LOG_FILE")
# Threading optimization
OMP_NUM_THREADS: int = int(os.getenv("OMP_NUM_THREADS", "4"))
MKL_NUM_THREADS: int = int(os.getenv("MKL_NUM_THREADS", "4"))
# Parallel Processing Configuration
MAX_PARALLEL_UPLOADS: int = int(os.getenv("MAX_PARALLEL_UPLOADS", "4"))
MAX_PARALLEL_AI_REQUESTS: int = int(os.getenv("MAX_PARALLEL_AI_REQUESTS", "3"))
MAX_PARALLEL_PROCESSING: int = int(os.getenv("MAX_PARALLEL_PROCESSING", "2"))
PARALLEL_AI_STRATEGY: str = os.getenv("PARALLEL_AI_STRATEGY", "race") # race, consensus, majority
BACKGROUND_NOTION_UPLOADS: bool = os.getenv("BACKGROUND_NOTION_UPLOADS", "true").lower() == "true"
# ========================================================================
# PROPERTIES WITH VALIDATION
# ========================================================================
@property
def is_production(self) -> bool:
"""Check if running in production mode"""
# Production is defined here purely as "DEBUG is off"; DEBUG is parsed from
# the DEBUG environment variable where the class attributes are declared.
return not self.DEBUG
@property
def has_webdav_config(self) -> bool:
"""Check if WebDAV credentials are configured"""
# NOTE(review): two return statements appear in this span. Both forms are
# truthy only when NEXTCLOUD_URL, NEXTCLOUD_USER and NEXTCLOUD_PASSWORD are
# all non-empty, so they agree on the result.
return all([self.NEXTCLOUD_URL, self.NEXTCLOUD_USER, self.NEXTCLOUD_PASSWORD])
# The string below is Spanish for "Checks whether WebDAV configuration exists."
"""Verifica si hay configuración de WebDAV."""
return bool(self.NEXTCLOUD_URL and self.NEXTCLOUD_USER and self.NEXTCLOUD_PASSWORD)
@property
def has_ai_config(self) -> bool:
"""Check if AI providers are configured"""
# True as soon as any one of these four settings is truthy; unset (None)
# and empty-string values count as "not configured".
return any(
[
self.ZAI_AUTH_TOKEN,
self.GEMINI_API_KEY,
self.CLAUDE_CLI_PATH,
self.GEMINI_CLI_PATH,
]
)
@property
def has_notion_config(self) -> bool:
"""Check if Notion is configured"""
# Both the API token and the database ID must be set; either one alone is
# reported as not configured.
return bool(self.NOTION_API_TOKEN and self.NOTION_DATABASE_ID)
@property
def processed_files_path(self) -> Path:
"""Get the path to the processed files registry"""
# The PROCESSED_FILES_PATH environment variable, when set, overrides the
# default location <LOCAL_STATE_DIR>/processed_files.txt. The variable is
# read on every access of this property, not once at import time.
return Path(
os.getenv(
"PROCESSED_FILES_PATH",
str(Path(self.LOCAL_STATE_DIR) / "processed_files.txt"),
)
)
@property
def nextcloud_url(self) -> str:
"""Get Nextcloud URL with validation"""
# Raises ConfigurationError only when the value is empty AND is_production
# is truthy; otherwise an empty value is returned unchanged.
if not self.NEXTCLOUD_URL and self.is_production:
raise ConfigurationError("NEXTCLOUD_URL is required in production mode")
return self.NEXTCLOUD_URL
@property
def nextcloud_user(self) -> str:
"""Get Nextcloud username with validation"""
# Raises ConfigurationError only when the value is empty AND is_production
# is truthy; otherwise an empty value is returned unchanged.
if not self.NEXTCLOUD_USER and self.is_production:
raise ConfigurationError("NEXTCLOUD_USER is required in production mode")
return self.NEXTCLOUD_USER
@property
def nextcloud_password(self) -> str:
"""Get Nextcloud password with validation"""
# Raises ConfigurationError only when the value is empty AND is_production
# is truthy; otherwise an empty value is returned unchanged.
if not self.NEXTCLOUD_PASSWORD and self.is_production:
raise ConfigurationError(
"NEXTCLOUD_PASSWORD is required in production mode"
)
return self.NEXTCLOUD_PASSWORD
@property
def valid_webdav_config(self) -> bool:
"""Validate WebDAV configuration completeness"""
# The three validated accessors are read only to see whether they raise;
# their values are discarded.
# NOTE: those accessors raise only when is_production is truthy, so outside
# production this returns True even if the credentials are empty.
try:
_ = self.nextcloud_url
_ = self.nextcloud_user
_ = self.nextcloud_password
return True
except ConfigurationError:
return False
@property
def telegram_configured(self) -> bool:
"""Check if Telegram is properly configured"""
# NOTE(review): two property names (telegram_configured and
# has_telegram_config) appear over a single body in this span, and
# config_summary reads self.telegram_configured -- confirm which name
# callers are expected to use.
def has_telegram_config(self) -> bool:
"""Check whether Telegram configuration is present."""
# Both the bot token and the chat ID must be set.
return bool(self.TELEGRAM_TOKEN and self.TELEGRAM_CHAT_ID)
@property
def has_gpu_support(self) -> bool:
"""Check if GPU support is available"""
# torch is imported inside the property, so it is only needed when this
# property is actually accessed.
try:
import torch
# The result is whatever torch reports for CUDA availability.
return torch.cuda.is_available()
except ImportError:
# torch is not installed: report no GPU support instead of failing.
return False
@property
def environment_type(self) -> str:
"""Get environment type as string"""
# Maps the is_production flag to one of two labels:
# "production" or "development".
return "production" if self.is_production else "development"
@property
def config_summary(self) -> dict:
"""Get configuration summary for logging"""
# Only names, flags and numeric settings are included; no token, password
# or API key values are placed in the returned dict.
# Building the dict evaluates has_gpu_support, which attempts to import
# torch.
return {
"app_name": self.APP_NAME,
"version": self.APP_VERSION,
"environment": self.environment_type,
"debug": self.DEBUG,
"webdav_configured": self.has_webdav_config,
"ai_configured": self.has_ai_config,
"telegram_configured": self.telegram_configured,
"gpu_support": self.has_gpu_support,
"cpu_count": self.CPU_COUNT,
"poll_interval": self.POLL_INTERVAL,
}
def is_production(self) -> bool:
"""Check whether the app is running in production mode."""
# Reads the ENV environment variable on every call (default "development");
# only the exact, case-sensitive value "production" yields True.
# NOTE(review): no decorator is visible directly above this def -- confirm
# whether callers use it as a property or call it as a method.
return os.getenv("ENV", "development") == "production"
# Create global settings instance
# Instancia global de configuración
settings = Settings()