3184 lines
125 KiB
Python
3184 lines
125 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Nextcloud AI Service - Unified Main Service
|
||
Combina todas las funcionalidades de procesamiento de audio, PDF y documentos en un solo archivo.
|
||
"""
|
||
|
||
import fcntl
|
||
import logging
|
||
import os
|
||
import re
|
||
import shutil
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
import unicodedata
|
||
import xml.etree.ElementTree as ET
|
||
from datetime import datetime, timedelta
|
||
from pathlib import Path
|
||
from typing import Dict, Optional, Set
|
||
|
||
import cv2
|
||
import easyocr
|
||
import numpy as np
|
||
import pytesseract
|
||
import requests
|
||
import torch
|
||
import whisper
|
||
import textwrap
|
||
from concurrent.futures import ThreadPoolExecutor
|
||
from docx import Document
|
||
from docx.shared import Inches
|
||
from pdf2image import convert_from_path
|
||
from pypdf import PdfReader, PdfWriter
|
||
from requests.adapters import HTTPAdapter
|
||
from requests.auth import HTTPBasicAuth
|
||
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
||
from reportlab.lib.pagesizes import letter
|
||
from reportlab.pdfgen import canvas
|
||
|
||
# --- CONFIGURACIÓN DE LOGGING ---
|
||
logging.basicConfig(
|
||
level=logging.INFO,
|
||
format="%(asctime)s [%(levelname)s] - %(message)s",
|
||
handlers=[logging.StreamHandler()]
|
||
)
|
||
|
||
# --- CONFIGURACIÓN DE VARIABLES DE ENTORNO ---
|
||
# Cargar variables desde archivo .env si existe
|
||
try:
|
||
from dotenv import load_dotenv
|
||
load_dotenv()
|
||
except ImportError:
|
||
pass
|
||
|
||
NEXTCLOUD_URL = os.getenv("NEXTCLOUD_URL")
|
||
NEXTCLOUD_USER = os.getenv("NEXTCLOUD_USER")
|
||
NEXTCLOUD_PASS = os.getenv("NEXTCLOUD_PASS")
|
||
WEBDAV_ENDPOINT = NEXTCLOUD_URL
|
||
|
||
REMOTE_AUDIOS_FOLDER = "Audios"
|
||
REMOTE_DOCX_AUDIO_FOLDER = "Documentos"
|
||
REMOTE_PDF_FOLDER = "Pdf"
|
||
REMOTE_TXT_FOLDER = "Textos"
|
||
RESUMENES_FOLDER = "Resumenes"
|
||
DOCX_FOLDER = "Documentos"
|
||
|
||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||
LOCAL_STATE_DIR = os.environ.get("LOCAL_STATE_DIR", BASE_DIR)
|
||
LEGACY_PROCESSED_PATHS = ["/app/processed_files.txt"]
|
||
|
||
LOCAL_DOWNLOADS_PATH = os.path.join(BASE_DIR, "downloads")
|
||
LOCAL_RESUMENES = LOCAL_DOWNLOADS_PATH
|
||
LOCAL_DOCX = os.path.join(BASE_DIR, "resumenes_docx")
|
||
POLL_INTERVAL = 5
|
||
PROCESSED_FILES_PATH = os.environ.get(
|
||
"PROCESSED_FILES_PATH",
|
||
os.path.join(LOCAL_STATE_DIR, "processed_files.txt")
|
||
)
|
||
|
||
AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".ogg", ".aac"}
|
||
PDF_EXTENSIONS = {".pdf"}
|
||
TXT_EXTENSIONS = {".txt"}
|
||
|
||
HTTP_TIMEOUT = int(os.getenv("HTTP_TIMEOUT", "30"))
|
||
WEBDAV_MAX_RETRIES = int(os.getenv("WEBDAV_MAX_RETRIES", "3"))
|
||
DOWNLOAD_CHUNK_SIZE = int(os.getenv("DOWNLOAD_CHUNK_SIZE", "8192"))
|
||
MAX_FILENAME_LENGTH = int(os.getenv("MAX_FILENAME_LENGTH", "80"))
|
||
MAX_FILENAME_BASE_LENGTH = int(os.getenv("MAX_FILENAME_BASE_LENGTH", "40"))
|
||
MAX_FILENAME_TOPICS_LENGTH = int(os.getenv("MAX_FILENAME_TOPICS_LENGTH", "20"))
|
||
|
||
ZAI_BASE_URL = os.getenv("ZAI_BASE_URL", "https://api.z.ai/api/anthropic")
|
||
ZAI_DEFAULT_MODEL = os.getenv("ZAI_MODEL", "glm-4.6")
|
||
ZAI_AUTH_TOKEN_FALLBACK = os.getenv(
|
||
"ZAI_AUTH_TOKEN",
|
||
os.getenv("ANTHROPIC_AUTH_TOKEN", "6fef8efda3d24eb9ad3d718daf1ae9a1.RcFc7QPe5uZLr2mS"),
|
||
)
|
||
|
||
_WEBDAV_SESSION: Optional[requests.Session] = None
|
||
ProcessedRegistry = Set[str]
|
||
|
||
# API KEYS
|
||
DEFAULT_GEMINI_API_KEY = "AIzaSyDWOgyAJqscuPU6iSpS6gxupWBm4soNw5o"
|
||
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or DEFAULT_GEMINI_API_KEY
|
||
TELEGRAM_TOKEN = os.getenv("TELEGRAM_TOKEN")
|
||
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID")
|
||
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://ollama:11434")
|
||
OLLAMA_MODEL = "mistral:7b"
|
||
GEMINI_CLI_PATH = shutil.which("gemini")
|
||
CLAUDE_CLI_PATH = shutil.which("claude")
|
||
GEMINI_FLASH_MODEL = os.getenv("GEMINI_FLASH_MODEL")
|
||
GEMINI_PRO_MODEL = os.getenv("GEMINI_PRO_MODEL")
|
||
|
||
|
||
def _initialize_gemini_model_defaults() -> None:
|
||
"""Selecciona automáticamente los modelos Gemini 2.5 más recientes disponibles."""
|
||
global GEMINI_FLASH_MODEL, GEMINI_PRO_MODEL
|
||
|
||
default_flash = "gemini-2.5-flash"
|
||
default_pro = "gemini-2.5-pro-preview-06-05"
|
||
|
||
if GEMINI_FLASH_MODEL and GEMINI_PRO_MODEL:
|
||
return
|
||
|
||
if not GEMINI_API_KEY:
|
||
GEMINI_FLASH_MODEL = GEMINI_FLASH_MODEL or default_flash
|
||
GEMINI_PRO_MODEL = GEMINI_PRO_MODEL or default_pro
|
||
return
|
||
|
||
try:
|
||
response = requests.get(
|
||
"https://generativelanguage.googleapis.com/v1beta/models",
|
||
params={"key": GEMINI_API_KEY},
|
||
timeout=12,
|
||
)
|
||
response.raise_for_status()
|
||
payload = response.json()
|
||
models = payload.get("models", [])
|
||
|
||
def choose_latest(pattern: str) -> Optional[str]:
|
||
candidate_stable = None
|
||
preview_candidates = []
|
||
|
||
for model_info in models:
|
||
name = model_info.get("name", "")
|
||
if not name.startswith("models/gemini-2.5"):
|
||
continue
|
||
|
||
base_name = name.split("/", 1)[-1]
|
||
|
||
if pattern == "-flash":
|
||
if "-flash" not in base_name or "-flash-lite" in base_name:
|
||
continue
|
||
else:
|
||
if pattern not in base_name:
|
||
continue
|
||
|
||
if "preview" not in base_name and candidate_stable is None:
|
||
candidate_stable = base_name
|
||
else:
|
||
version = model_info.get("version") or ""
|
||
preview_candidates.append((version, base_name))
|
||
|
||
if candidate_stable:
|
||
return candidate_stable
|
||
|
||
if preview_candidates:
|
||
preview_candidates.sort(key=lambda item: item[0], reverse=True)
|
||
return preview_candidates[0][1]
|
||
|
||
return None
|
||
|
||
if not GEMINI_FLASH_MODEL:
|
||
selected_flash = choose_latest("-flash")
|
||
if selected_flash:
|
||
GEMINI_FLASH_MODEL = selected_flash
|
||
|
||
if not GEMINI_PRO_MODEL:
|
||
selected_pro = choose_latest("-pro")
|
||
if selected_pro:
|
||
GEMINI_PRO_MODEL = selected_pro
|
||
|
||
except Exception as exc:
|
||
logging.warning(f"No se pudo obtener la lista de modelos Gemini: {exc}")
|
||
|
||
GEMINI_FLASH_MODEL = GEMINI_FLASH_MODEL or default_flash
|
||
GEMINI_PRO_MODEL = GEMINI_PRO_MODEL or default_pro
|
||
|
||
|
||
_initialize_gemini_model_defaults()
|
||
|
||
GEMINI_AVAILABLE = bool(GEMINI_CLI_PATH or GEMINI_API_KEY or CLAUDE_CLI_PATH)
|
||
|
||
# --- CONFIGURACIÓN DE CARPETAS TEMÁTICAS ---
|
||
TEMATIC_FOLDERS = {
|
||
"historia": "Historia",
|
||
"analisis_contable": "Analisis Contable",
|
||
"instituciones_gobierno": "Instituciones del Gobierno",
|
||
"otras_clases": "Otras Clases"
|
||
}
|
||
|
||
# CONFIGURACIÓN PDF - OPTIMIZADO ADAPTATIVO (GPU/CPU)
|
||
_CPU_COUNT = os.cpu_count() or 1
|
||
MAX_PAGES_PER_CHUNK = max(1, int(os.getenv("PDF_MAX_PAGES_PER_CHUNK", "2"))) # Reducido de 3 a 2
|
||
PDF_DPI = max(150, int(os.getenv("PDF_DPI", "200"))) # Mínimo 150 para calidad legible
|
||
PDF_RENDER_THREAD_COUNT = max(1, int(os.getenv("PDF_RENDER_THREAD_COUNT", str(min(4, _CPU_COUNT))))) # Reducido hilos
|
||
PDF_BATCH_SIZE = max(1, int(os.getenv("PDF_BATCH_SIZE", "2"))) # Reducido de 4 a 2
|
||
PDF_TROCR_MAX_BATCH = max(1, int(os.getenv("PDF_TROCR_MAX_BATCH", str(PDF_BATCH_SIZE))))
|
||
PDF_TESSERACT_THREADS = max(1, int(os.getenv("PDF_TESSERACT_THREADS", str(max(1, min(2, max(1, _CPU_COUNT // 3))))))) # Reducido
|
||
# Reutilizamos los mismos hilos para preprocesamiento y OCR CPU
|
||
PDF_PREPROCESS_THREADS = max(1, int(os.getenv("PDF_PREPROCESS_THREADS", str(PDF_TESSERACT_THREADS))))
|
||
|
||
try:
|
||
PDF_TEXT_DETECTION_MIN_RATIO = float(os.getenv("PDF_TEXT_DETECTION_MIN_RATIO", "0.6"))
|
||
except ValueError:
|
||
PDF_TEXT_DETECTION_MIN_RATIO = 0.6
|
||
|
||
try:
|
||
PDF_TEXT_DETECTION_MIN_AVG_CHARS = int(os.getenv("PDF_TEXT_DETECTION_MIN_AVG_CHARS", "120"))
|
||
except ValueError:
|
||
PDF_TEXT_DETECTION_MIN_AVG_CHARS = 120
|
||
|
||
# ERROR THROTTLING
|
||
ERROR_THROTTLE_SECONDS = int(os.environ.get("ERROR_THROTTLE_SECONDS", "600"))
|
||
_last_error_cache = {}
|
||
|
||
# Caché para modelos con sistema de timeout
|
||
_whisper_model = None
|
||
_ocr_models = None
|
||
_trocr_models = None
|
||
_models_last_used = None
|
||
_MODEL_TIMEOUT_SECONDS = int(os.environ.get("MODEL_TIMEOUT_SECONDS", "300")) # 300 segundos (5 minutos) para liberar más rápido
|
||
|
||
# --- TELEGRAM NOTIFICATION FUNCTIONS ---
|
||
def send_telegram_message(message, retries=3, delay=2):
|
||
"""Envía mensaje a Telegram sin parsing de entidades para evitar errores"""
|
||
if not TELEGRAM_TOKEN or not TELEGRAM_CHAT_ID:
|
||
logging.warning("Telegram token or chat ID not set. Skipping notification.")
|
||
return False
|
||
|
||
url = f"https://api.telegram.org/bot{TELEGRAM_TOKEN}/sendMessage"
|
||
data = {
|
||
"chat_id": TELEGRAM_CHAT_ID,
|
||
"text": message
|
||
}
|
||
|
||
for attempt in range(retries):
|
||
try:
|
||
resp = requests.post(url, data=data, timeout=10)
|
||
if resp.status_code == 200:
|
||
return True
|
||
else:
|
||
logging.error(f"Telegram API error: {resp.status_code} {resp.text}")
|
||
except Exception as e:
|
||
logging.error(f"Telegram notification failed (attempt {attempt+1}/{retries}): {e}")
|
||
time.sleep(delay)
|
||
|
||
logging.error("Telegram notification failed after all retries.")
|
||
return False
|
||
|
||
def should_send_error(key, message):
|
||
"""Return True if we should notify for this (key, message) given throttle rules."""
|
||
now = datetime.utcnow()
|
||
prev = _last_error_cache.get(key)
|
||
if prev is None:
|
||
_last_error_cache[key] = (message, now)
|
||
return True
|
||
prev_msg, prev_time = prev
|
||
if message != prev_msg or (now - prev_time).total_seconds() > ERROR_THROTTLE_SECONDS:
|
||
_last_error_cache[key] = (message, now)
|
||
return True
|
||
return False
|
||
|
||
def _update_models_usage():
|
||
"""Actualiza el timestamp de uso de los modelos"""
|
||
global _models_last_used
|
||
_models_last_used = datetime.utcnow()
|
||
logging.debug(f"Timestamp actualizado: {_models_last_used}")
|
||
|
||
def _check_and_free_vram():
|
||
"""Libera VRAM si los modelos no se han usado en el tiempo especificado"""
|
||
global _whisper_model, _ocr_models, _trocr_models, _models_last_used
|
||
|
||
now = datetime.utcnow()
|
||
|
||
# Limpieza básica sin interrumpir el procesamiento
|
||
if torch.cuda.is_available():
|
||
try:
|
||
# Solo limpiar caché básica sin liberar modelos
|
||
torch.cuda.empty_cache()
|
||
except:
|
||
pass
|
||
|
||
if _models_last_used is None:
|
||
return
|
||
|
||
idle_time = (now - _models_last_used).total_seconds()
|
||
|
||
# Verificar si hay modelos cargados antes de liberar
|
||
models_loaded = _whisper_model is not None or _ocr_models is not None or _trocr_models is not None
|
||
|
||
# Solo liberar después de 10 minutos de inactividad real
|
||
if idle_time > _MODEL_TIMEOUT_SECONDS and models_loaded:
|
||
logging.info(f"🔄 Models idle for {idle_time:.1f}s (> {_MODEL_TIMEOUT_SECONDS}s), freeing VRAM...")
|
||
|
||
models_freed = []
|
||
|
||
# Liberar modelo Whisper
|
||
if _whisper_model is not None:
|
||
try:
|
||
if torch.cuda.is_available():
|
||
del _whisper_model
|
||
_whisper_model = None
|
||
models_freed.append("Whisper")
|
||
except Exception as e:
|
||
logging.error(f"Error freeing Whisper VRAM: {e}")
|
||
|
||
# Liberar modelos OCR
|
||
if _ocr_models is not None:
|
||
try:
|
||
_ocr_models = None
|
||
models_freed.append("OCR")
|
||
except Exception as e:
|
||
logging.error(f"Error freeing OCR VRAM: {e}")
|
||
|
||
# Liberar modelos TrOCR
|
||
if _trocr_models is not None:
|
||
try:
|
||
if torch.cuda.is_available():
|
||
model = _trocr_models.get('model') if isinstance(_trocr_models, dict) else None
|
||
if model is not None:
|
||
model.to('cpu')
|
||
models_freed.append("TrOCR")
|
||
torch.cuda.empty_cache()
|
||
except Exception as e:
|
||
logging.error(f"Error freeing TrOCR VRAM: {e}")
|
||
|
||
# Limpiar variables globales (los modelos se recargarán cuando se necesiten)
|
||
_whisper_model = None
|
||
_ocr_models = None
|
||
_trocr_models = None
|
||
_models_last_used = None
|
||
|
||
# Forzar limpieza agresiva de VRAM
|
||
_force_aggressive_vram_cleanup()
|
||
|
||
if models_freed:
|
||
logging.info(f"🎯 Models freed from GPU: {', '.join(models_freed)}, VRAM liberated")
|
||
|
||
# Mostrar estado actual de VRAM cada 120 segundos para depuración
|
||
elif idle_time % 120 < 10: # Cada ~120 segundos
|
||
vram_status = get_vram_usage()
|
||
if isinstance(vram_status, dict) and vram_status.get('any_models_loaded', False):
|
||
logging.info(f"📊 VRAM Status - Allocated: {vram_status.get('allocated_gb', 0)}GB, Idle: {idle_time:.1f}s")
|
||
|
||
def _force_aggressive_vram_cleanup():
|
||
"""Fuerza una limpieza agresiva de VRAM para liberar toda la memoria posible"""
|
||
try:
|
||
import gc
|
||
|
||
logging.info("🔥 Iniciando limpieza agresiva de VRAM...")
|
||
|
||
if torch.cuda.is_available():
|
||
# Mostrar estado antes de la limpieza
|
||
before_allocated = torch.cuda.memory_allocated(0) / 1024**3
|
||
before_reserved = torch.cuda.memory_reserved(0) / 1024**3
|
||
logging.info(f"📊 Antes de limpieza - Allocated: {before_allocated:.2f}GB, Reserved: {before_reserved:.2f}GB")
|
||
|
||
# Estrategia 1: Liberar caché básica
|
||
torch.cuda.empty_cache()
|
||
|
||
# Estrategia 2: Forzar garbage collection múltiple
|
||
for i in range(5):
|
||
gc.collect()
|
||
torch.cuda.empty_cache()
|
||
|
||
# Estrategia 3: Liberar memoria del pool de PyTorch
|
||
if hasattr(torch.cuda, 'memory'):
|
||
try:
|
||
# Intentar liberar el memory pool
|
||
torch.cuda.memory.empty_cache()
|
||
except:
|
||
pass
|
||
|
||
# Estrategia 4: Sincronizar y liberar streams
|
||
try:
|
||
torch.cuda.synchronize()
|
||
torch.cuda.empty_cache()
|
||
except:
|
||
pass
|
||
|
||
# Estrategia 5: Forzar liberación de memoria reservada
|
||
if torch.cuda.memory_reserved(0) > 0:
|
||
logging.info(f"🧹 Intentando liberar memoria reservada: {torch.cuda.memory_reserved(0) / 1024**3:.2f}GB")
|
||
|
||
# Último recurso: intentar resetear el estado de CUDA
|
||
try:
|
||
# Liberar todos los caches posibles
|
||
if hasattr(torch.cuda, 'memory_snapshot'):
|
||
torch.cuda.memory_snapshot()
|
||
|
||
torch.cuda.empty_cache()
|
||
gc.collect()
|
||
|
||
# Si aún hay memoria reservada, intentar un enfoque más agresivo
|
||
if torch.cuda.memory_reserved(0) > 1024**3: # Más de 1GB
|
||
logging.warning("🚨 Usando liberación extrema de VRAM...")
|
||
|
||
# Forzar liberación completa del contexto
|
||
torch.cuda.set_device(0)
|
||
torch.cuda.empty_cache()
|
||
|
||
# Múltiples ciclos de limpieza
|
||
for _ in range(3):
|
||
gc.collect()
|
||
torch.cuda.empty_cache()
|
||
time.sleep(0.1) # Pequeña pausa para permitir liberación
|
||
|
||
except Exception as e:
|
||
logging.warning(f"Error en liberación extrema: {e}")
|
||
|
||
# Mostrar estado después de la limpieza
|
||
after_allocated = torch.cuda.memory_allocated(0) / 1024**3
|
||
after_reserved = torch.cuda.memory_reserved(0) / 1024**3
|
||
logging.info(f"📊 Después de limpieza - Allocated: {after_allocated:.2f}GB, Reserved: {after_reserved:.2f}GB")
|
||
|
||
if after_reserved < before_reserved:
|
||
logging.info(f"✅ Memoria liberada: {(before_reserved - after_reserved):.2f}GB")
|
||
else:
|
||
logging.warning("⚠️ No se pudo liberar memoria reservada significativamente")
|
||
|
||
logging.info("✅ Limpieza agresiva de VRAM completada")
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error en limpieza agresiva de VRAM: {e}")
|
||
|
||
def _start_vram_cleanup_timer():
|
||
"""Inicia un hilo de monitoreo continuo para liberar VRAM"""
|
||
import threading
|
||
|
||
def cleanup_worker():
|
||
while True:
|
||
time.sleep(60) # Verificar cada 60 segundos (no tan frecuente)
|
||
_check_and_free_vram()
|
||
# Eliminar limpieza extrema adicional que interrumpe el procesamiento
|
||
|
||
thread = threading.Thread(target=cleanup_worker, daemon=True)
|
||
thread.start()
|
||
|
||
def _force_complete_vram_cleanup():
|
||
"""Fuerza una limpieza completa de VRAM para eliminar residuos"""
|
||
global _models_last_used
|
||
|
||
try:
|
||
if torch.cuda.is_available():
|
||
# Verificar si hay residuos
|
||
allocated_mb = torch.cuda.memory_allocated(0) / 1024**2
|
||
reserved_mb = torch.cuda.memory_reserved(0) / 1024**2
|
||
|
||
# Si hay más de 50MiB residuales, forzar limpieza extrema
|
||
if allocated_mb > 50 and (_models_last_used is None or
|
||
(datetime.utcnow() - _models_last_used).total_seconds() > 30):
|
||
|
||
logging.info(f"🔥 Limpieza extrema: {allocated_mb:.1f}MiB residuales detectados")
|
||
|
||
# Estrategia 1: Reset completo del contexto CUDA
|
||
try:
|
||
# Guardar dispositivo actual
|
||
current_device = torch.cuda.current_device()
|
||
|
||
# Liberar todo lo posible
|
||
torch.cuda.empty_cache()
|
||
import gc
|
||
gc.collect()
|
||
|
||
# Múltiples ciclos de limpieza
|
||
for i in range(5):
|
||
gc.collect()
|
||
torch.cuda.empty_cache()
|
||
time.sleep(0.05)
|
||
|
||
# Intentar resetear el dispositivo
|
||
if hasattr(torch.cuda, 'memory_snapshot'):
|
||
try:
|
||
torch.cuda.memory_snapshot()
|
||
except:
|
||
pass
|
||
|
||
# Sincronizar y limpiar
|
||
torch.cuda.synchronize()
|
||
torch.cuda.empty_cache()
|
||
|
||
# Volver al dispositivo original
|
||
torch.cuda.set_device(current_device)
|
||
|
||
# Verificar resultado
|
||
new_allocated_mb = torch.cuda.memory_allocated(0) / 1024**2
|
||
if new_allocated_mb < allocated_mb:
|
||
logging.info(f"✅ Limpieza extrema exitosa: {allocated_mb:.1f}MiB -> {new_allocated_mb:.1f}MiB")
|
||
|
||
except Exception as e:
|
||
logging.warning(f"Error en limpieza extrema: {e}")
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error en limpieza de VRAM: {e}")
|
||
|
||
def get_vram_usage():
|
||
"""Retorna información sobre el uso de VRAM"""
|
||
if torch.cuda.is_available():
|
||
total = torch.cuda.get_device_properties(0).total_memory / 1024**3 # GB
|
||
allocated = torch.cuda.memory_allocated(0) / 1024**3 # GB
|
||
cached = torch.cuda.memory_reserved(0) / 1024**3 # GB
|
||
free = total - allocated
|
||
|
||
return {
|
||
'total_gb': round(total, 2),
|
||
'allocated_gb': round(allocated, 2),
|
||
'cached_gb': round(cached, 2),
|
||
'free_gb': round(free, 2),
|
||
'whisper_loaded': _whisper_model is not None,
|
||
'ocr_models_loaded': _ocr_models is not None,
|
||
'trocr_models_loaded': _trocr_models is not None,
|
||
'any_models_loaded': _whisper_model is not None or _ocr_models is not None or _trocr_models is not None,
|
||
'last_used': _models_last_used.isoformat() if _models_last_used else None,
|
||
'timeout_seconds': _MODEL_TIMEOUT_SECONDS
|
||
}
|
||
else:
|
||
return {'error': 'CUDA not available'}
|
||
|
||
def force_free_vram():
|
||
"""Fuerza la liberación inmediata de VRAM"""
|
||
logging.info("🔧 Manual VRAM free triggered")
|
||
|
||
# Forzar liberación inmediata sin esperar timeout
|
||
global _whisper_model, _ocr_models, _trocr_models, _models_last_used
|
||
|
||
models_freed = []
|
||
|
||
# Liberar todos los modelos inmediatamente
|
||
if _whisper_model is not None:
|
||
try:
|
||
if torch.cuda.is_available():
|
||
del _whisper_model
|
||
_whisper_model = None
|
||
models_freed.append("Whisper")
|
||
except Exception as e:
|
||
logging.error(f"Error freeing Whisper VRAM: {e}")
|
||
|
||
if _ocr_models is not None:
|
||
try:
|
||
_ocr_models = None
|
||
models_freed.append("OCR")
|
||
except Exception as e:
|
||
logging.error(f"Error freeing OCR VRAM: {e}")
|
||
|
||
if _trocr_models is not None:
|
||
try:
|
||
if torch.cuda.is_available():
|
||
model = _trocr_models.get('model') if isinstance(_trocr_models, dict) else None
|
||
if model is not None:
|
||
model.to('cpu')
|
||
models_freed.append("TrOCR")
|
||
torch.cuda.empty_cache()
|
||
except Exception as e:
|
||
logging.error(f"Error freeing TrOCR VRAM: {e}")
|
||
|
||
# Limpiar variables globales
|
||
_whisper_model = None
|
||
_ocr_models = None
|
||
_trocr_models = None
|
||
_models_last_used = None
|
||
|
||
# Forzar limpieza agresiva
|
||
_force_aggressive_vram_cleanup()
|
||
|
||
if models_freed:
|
||
logging.info(f"🎯 Manual VRAM free - Models freed: {', '.join(models_freed)}")
|
||
|
||
return "VRAM freed successfully"
|
||
|
||
|
||
def ensure_local_directories() -> None:
|
||
"""Garantiza que las carpetas locales necesarias existan."""
|
||
for path in (LOCAL_DOWNLOADS_PATH, LOCAL_RESUMENES, LOCAL_DOCX):
|
||
Path(path).mkdir(parents=True, exist_ok=True)
|
||
|
||
# --- HELPER FUNCTIONS ---
|
||
def normalize_remote_path(path):
|
||
"""Normalize remote paths to a consistent representation."""
|
||
if not path:
|
||
return ""
|
||
normalized = unicodedata.normalize("NFC", str(path)).strip()
|
||
if not normalized:
|
||
return ""
|
||
normalized = normalized.replace("\\", "/")
|
||
normalized = re.sub(r"/+", "/", normalized)
|
||
return normalized.lstrip("/")
|
||
|
||
|
||
def _ensure_webdav_credentials() -> None:
|
||
missing = [
|
||
name for name, value in (
|
||
("NEXTCLOUD_URL", NEXTCLOUD_URL),
|
||
("NEXTCLOUD_USER", NEXTCLOUD_USER),
|
||
("NEXTCLOUD_PASS", NEXTCLOUD_PASS),
|
||
)
|
||
if not value
|
||
]
|
||
if missing:
|
||
raise RuntimeError(
|
||
"Missing Nextcloud WebDAV configuration: " + ", ".join(missing)
|
||
)
|
||
|
||
|
||
def _get_webdav_session() -> requests.Session:
|
||
global _WEBDAV_SESSION
|
||
if _WEBDAV_SESSION is None:
|
||
_ensure_webdav_credentials()
|
||
session = requests.Session()
|
||
session.auth = HTTPBasicAuth(NEXTCLOUD_USER, NEXTCLOUD_PASS)
|
||
adapter = HTTPAdapter(max_retries=WEBDAV_MAX_RETRIES)
|
||
session.mount("http://", adapter)
|
||
session.mount("https://", adapter)
|
||
_WEBDAV_SESSION = session
|
||
return _WEBDAV_SESSION
|
||
|
||
|
||
def _build_webdav_url(path: str) -> str:
|
||
_ensure_webdav_credentials()
|
||
base = (WEBDAV_ENDPOINT or "").rstrip("/")
|
||
if not base:
|
||
raise RuntimeError("NEXTCLOUD_URL is not configured")
|
||
normalized_path = normalize_remote_path(path)
|
||
return f"{base}/{normalized_path}" if normalized_path else base
|
||
|
||
|
||
def _snapshot_existing_remote_files():
|
||
"""Collect current remote files to seed the processed registry on first run."""
|
||
snapshot = set()
|
||
targets = [
|
||
(REMOTE_AUDIOS_FOLDER, AUDIO_EXTENSIONS),
|
||
(REMOTE_PDF_FOLDER, PDF_EXTENSIONS),
|
||
]
|
||
|
||
for remote_folder, extensions in targets:
|
||
try:
|
||
for remote_path in webdav_list(remote_folder):
|
||
normalized = normalize_remote_path(remote_path)
|
||
if not normalized:
|
||
continue
|
||
if not any(normalized.lower().endswith(ext) for ext in extensions):
|
||
continue
|
||
snapshot.add(normalized)
|
||
except Exception as e:
|
||
logging.warning(f"No se pudo obtener listado inicial de '{remote_folder}': {e}")
|
||
|
||
return snapshot
|
||
|
||
|
||
def _initialize_processed_registry():
|
||
"""Ensure the processed files registry exists, migrating legacy data if needed."""
|
||
target_dir = os.path.dirname(PROCESSED_FILES_PATH) or BASE_DIR
|
||
|
||
try:
|
||
os.makedirs(target_dir, exist_ok=True)
|
||
except Exception as e:
|
||
logging.error(f"No se pudo crear el directorio para el registro de procesados: {e}")
|
||
return
|
||
|
||
for legacy_path in LEGACY_PROCESSED_PATHS:
|
||
if not legacy_path:
|
||
continue
|
||
if os.path.abspath(legacy_path) == os.path.abspath(PROCESSED_FILES_PATH):
|
||
continue
|
||
if os.path.exists(legacy_path):
|
||
try:
|
||
shutil.copy2(legacy_path, PROCESSED_FILES_PATH)
|
||
logging.info(f"Registro de procesados migrado desde {legacy_path}")
|
||
return
|
||
except Exception as e:
|
||
logging.error(f"Error al migrar registro de {legacy_path}: {e}")
|
||
|
||
snapshot = _snapshot_existing_remote_files()
|
||
try:
|
||
with open(PROCESSED_FILES_PATH, "w", encoding="utf-8") as f:
|
||
timestamp = datetime.utcnow().isoformat() + "Z"
|
||
f.write(f"# Archivos procesados - inicializado {timestamp}\n")
|
||
for entry in sorted(snapshot):
|
||
f.write(entry + "\n")
|
||
if snapshot:
|
||
logging.info(f"Registro de procesados inicializado con {len(snapshot)} entradas existentes")
|
||
except Exception as e:
|
||
logging.error(f"No se pudo crear el registro de procesados: {e}")
|
||
|
||
|
||
def load_processed_files() -> ProcessedRegistry:
|
||
processed: ProcessedRegistry = set()
|
||
|
||
if not os.path.exists(PROCESSED_FILES_PATH):
|
||
_initialize_processed_registry()
|
||
|
||
if not os.path.exists(PROCESSED_FILES_PATH):
|
||
logging.warning("Registro de procesados no disponible; se procesarán todos los archivos encontrados.")
|
||
return processed
|
||
|
||
try:
|
||
with open(PROCESSED_FILES_PATH, "r", encoding="utf-8") as f:
|
||
for raw_line in f:
|
||
line = raw_line.strip()
|
||
if not line or line.startswith('#'):
|
||
continue
|
||
|
||
normalized = normalize_remote_path(line)
|
||
if not normalized:
|
||
continue
|
||
|
||
ext = os.path.splitext(normalized)[1].lower()
|
||
if not ext:
|
||
continue
|
||
|
||
processed.add(normalized)
|
||
base_name = os.path.basename(normalized)
|
||
processed.add(base_name)
|
||
|
||
# Retrocompatibilidad para entradas sin carpeta
|
||
if '/' not in normalized:
|
||
if ext in AUDIO_EXTENSIONS:
|
||
processed.add(f"{REMOTE_AUDIOS_FOLDER}/{base_name}")
|
||
elif ext in PDF_EXTENSIONS:
|
||
processed.add(f"{REMOTE_PDF_FOLDER}/{base_name}")
|
||
|
||
return processed
|
||
except Exception as e:
|
||
logging.error(f"Error reading processed files: {e}")
|
||
return processed
|
||
|
||
|
||
def save_processed_file(remote_path: str) -> None:
|
||
normalized = normalize_remote_path(remote_path)
|
||
if not normalized:
|
||
logging.warning(f"Cannot mark empty remote path as processed: {remote_path}")
|
||
return
|
||
|
||
try:
|
||
processed: ProcessedRegistry = load_processed_files()
|
||
if normalized in processed or os.path.basename(normalized) in processed:
|
||
logging.info(f"Archivo ya marcado como procesado: {normalized}")
|
||
return
|
||
|
||
with open(PROCESSED_FILES_PATH, "a", encoding="utf-8") as f:
|
||
f.write(normalized + "\n")
|
||
logging.info(f"Marcado como procesado: {normalized}")
|
||
except Exception as e:
|
||
logging.error(f"Error saving processed file {normalized}: {e}")
|
||
# Intentar crear el archivo y reintentar
|
||
try:
|
||
os.makedirs(os.path.dirname(PROCESSED_FILES_PATH) or BASE_DIR, exist_ok=True)
|
||
with open(PROCESSED_FILES_PATH, "w", encoding="utf-8") as f:
|
||
f.write("# Archivos procesados - recreado automáticamente\n")
|
||
f.write(normalized + "\n")
|
||
logging.info(f"Archivo de procesados recreado y guardado: {normalized}")
|
||
except Exception as e2:
|
||
logging.error(f"Error recreating processed files: {e2}")
|
||
|
||
def run_subprocess(cmd, timeout):
|
||
"""Run subprocess capturing stdout/stderr and raise a descriptive error on failure."""
|
||
cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=timeout)
|
||
if cp.returncode != 0:
|
||
stderr = cp.stderr.strip()
|
||
stdout = cp.stdout.strip()
|
||
raise Exception(f"Command {cmd} failed (rc={cp.returncode}). stderr: {stderr!s} stdout: {stdout!s}")
|
||
return cp
|
||
|
||
def clean_filename(name):
|
||
"""Reemplaza caracteres problemáticos para WebDAV/Nextcloud"""
|
||
name = re.sub(r'[\\/:"*?<>|]+', '_', name)
|
||
name = name.replace('...', '_')
|
||
name = name.replace(' ', '_')
|
||
return name
|
||
|
||
# --- WEBDAV FUNCTIONS ---
|
||
def webdav_list(path: str) -> list[str]:
|
||
"""Lista archivos en una carpeta de Nextcloud usando PROPFIND."""
|
||
session = _get_webdav_session()
|
||
normalized_target = normalize_remote_path(path)
|
||
response = None
|
||
try:
|
||
response = session.request(
|
||
"PROPFIND",
|
||
_build_webdav_url(normalized_target),
|
||
headers={"Depth": "1"},
|
||
timeout=HTTP_TIMEOUT,
|
||
)
|
||
response.raise_for_status()
|
||
|
||
root = ET.fromstring(response.content)
|
||
files: list[str] = []
|
||
prefixes = ["/remote.php/webdav/"]
|
||
if NEXTCLOUD_USER:
|
||
prefixes.append(f"/remote.php/dav/files/{NEXTCLOUD_USER}/")
|
||
|
||
for response_node in root.findall("{DAV:}response"):
|
||
href_element = response_node.find("{DAV:}href")
|
||
if href_element is None or not href_element.text:
|
||
continue
|
||
|
||
relative_path = requests.utils.unquote(href_element.text)
|
||
for prefix in prefixes:
|
||
if relative_path.startswith(prefix):
|
||
relative_path = relative_path[len(prefix):]
|
||
|
||
normalized_response = normalize_remote_path(relative_path)
|
||
if not normalized_response or normalized_response.endswith('/'):
|
||
continue
|
||
if normalized_response.strip('/') == normalized_target.strip('/'):
|
||
continue
|
||
files.append(normalized_response)
|
||
|
||
return files
|
||
except Exception as exc:
|
||
logging.error(f"WebDAV LIST falló para '{path}': {exc}")
|
||
return []
|
||
finally:
|
||
if response is not None:
|
||
response.close()
|
||
|
||
|
||
def webdav_download(remote_path: str, local_path: str) -> None:
|
||
"""Descarga un archivo de Nextcloud."""
|
||
session = _get_webdav_session()
|
||
local_file = Path(local_path)
|
||
local_file.parent.mkdir(parents=True, exist_ok=True)
|
||
|
||
response = session.get(
|
||
_build_webdav_url(remote_path),
|
||
stream=True,
|
||
timeout=HTTP_TIMEOUT,
|
||
)
|
||
try:
|
||
response.raise_for_status()
|
||
with local_file.open('wb') as handle:
|
||
for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
|
||
if chunk:
|
||
handle.write(chunk)
|
||
finally:
|
||
response.close()
|
||
|
||
|
||
def webdav_upload(local_path: str, remote_path: str) -> None:
|
||
"""Sube un archivo a Nextcloud."""
|
||
session = _get_webdav_session()
|
||
with open(local_path, 'rb') as payload:
|
||
response = session.put(
|
||
_build_webdav_url(remote_path),
|
||
data=payload,
|
||
timeout=HTTP_TIMEOUT,
|
||
)
|
||
response.raise_for_status()
|
||
|
||
|
||
def webdav_mkdir(remote_path: str) -> None:
|
||
"""Crea una carpeta en Nextcloud."""
|
||
session = _get_webdav_session()
|
||
response = None
|
||
try:
|
||
response = session.request(
|
||
"MKCOL",
|
||
_build_webdav_url(remote_path),
|
||
timeout=HTTP_TIMEOUT,
|
||
)
|
||
if response.status_code in (200, 201, 204, 405):
|
||
return
|
||
response.raise_for_status()
|
||
except Exception as exc:
|
||
logging.error(f"WebDAV MKCOL falló para '{remote_path}': {exc}")
|
||
finally:
|
||
if response is not None:
|
||
response.close()
|
||
|
||
|
||
# --- CLAUDE (GLM-4.6) HELPERS ---
|
||
def get_claude_env(model: Optional[str] = None) -> Dict[str, str]:
|
||
env = os.environ.copy()
|
||
env.setdefault('ANTHROPIC_BASE_URL', ZAI_BASE_URL)
|
||
if ZAI_AUTH_TOKEN_FALLBACK:
|
||
env.setdefault('ANTHROPIC_AUTH_TOKEN', ZAI_AUTH_TOKEN_FALLBACK)
|
||
env['CLAUDE_DANGEROUSLY_SKIP_PERMISSIONS'] = '1'
|
||
|
||
chosen_model = model or ZAI_DEFAULT_MODEL
|
||
if chosen_model:
|
||
env.setdefault('CLAUDE_MODEL', chosen_model)
|
||
env.setdefault('CLAUDE_DEFAULT_MODEL', chosen_model)
|
||
env.setdefault('ANTHROPIC_DEFAULT_MODEL', chosen_model)
|
||
|
||
return env
|
||
|
||
|
||
def run_claude_cli(prompt: str, timeout: int = 300, model: Optional[str] = None) -> str:
|
||
env = get_claude_env(model)
|
||
cmd = ['claude', '--dangerously-skip-permissions']
|
||
|
||
process = subprocess.run(
|
||
cmd,
|
||
input=prompt,
|
||
env=env,
|
||
text=True,
|
||
capture_output=True,
|
||
timeout=timeout,
|
||
)
|
||
|
||
if process.returncode != 0:
|
||
stderr = (process.stderr or '').strip()
|
||
stdout = (process.stdout or '').strip()
|
||
message = stderr or stdout or 'sin salida'
|
||
raise RuntimeError(f"Claude CLI failed (rc={process.returncode}): {message}")
|
||
|
||
return (process.stdout or '').strip()
|
||
|
||
def _get_gemini_env(model_name: Optional[str] = None) -> Dict[str, str]:
|
||
env = os.environ.copy()
|
||
if GEMINI_API_KEY:
|
||
env.setdefault("GEMINI_API_KEY", GEMINI_API_KEY)
|
||
if model_name:
|
||
env.setdefault("GEMINI_MODEL", model_name)
|
||
return env
|
||
|
||
def _call_gemini_api(prompt: str, use_flash: bool = True, timeout: int = 180) -> str:
|
||
if not GEMINI_API_KEY:
|
||
raise RuntimeError("Gemini API key not configured")
|
||
|
||
if use_flash:
|
||
model = GEMINI_FLASH_MODEL or "gemini-2.5-flash"
|
||
else:
|
||
model = GEMINI_PRO_MODEL or "gemini-2.5-pro-preview-06-05"
|
||
endpoint = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
|
||
payload = {
|
||
"contents": [
|
||
{
|
||
"parts": [
|
||
{"text": prompt}
|
||
]
|
||
}
|
||
]
|
||
}
|
||
|
||
try:
|
||
response = requests.post(
|
||
endpoint,
|
||
params={"key": GEMINI_API_KEY},
|
||
json=payload,
|
||
timeout=timeout,
|
||
)
|
||
response.raise_for_status()
|
||
except requests.RequestException as exc:
|
||
raise RuntimeError(f"Gemini API request failed: {exc}") from exc
|
||
|
||
try:
|
||
data = response.json()
|
||
except ValueError as exc:
|
||
raise RuntimeError("Gemini API returned a non-JSON response") from exc
|
||
|
||
prompt_feedback = data.get("promptFeedback", {})
|
||
if prompt_feedback.get("blockReason"):
|
||
raise RuntimeError(f"Gemini prompt blocked: {prompt_feedback.get('blockReason')}")
|
||
|
||
candidates = data.get("candidates") or []
|
||
for candidate in candidates:
|
||
finish_reason = candidate.get("finishReason")
|
||
if finish_reason and finish_reason not in ("STOP", "FINISH_REASON_UNSPECIFIED"):
|
||
logging.warning(f"Gemini candidate finalizado con estado {finish_reason}, intentando leer contenido igualmente.")
|
||
parts = candidate.get("content", {}).get("parts", []) or []
|
||
texts = [part.get("text", "") for part in parts if part.get("text")]
|
||
if texts:
|
||
return "\n".join(texts).strip()
|
||
|
||
raise RuntimeError("Gemini API returned empty response")
|
||
|
||
def _call_gemini_cli(prompt: str, use_yolo: bool = True, timeout: int = 300) -> str:
|
||
if not GEMINI_CLI_PATH:
|
||
raise FileNotFoundError("Gemini CLI binary not found")
|
||
|
||
cmd = [GEMINI_CLI_PATH]
|
||
if use_yolo:
|
||
cmd.append("--yolo")
|
||
|
||
model_name = (GEMINI_FLASH_MODEL or "gemini-2.5-flash") if use_yolo else (GEMINI_PRO_MODEL or "gemini-2.5-pro-preview-06-05")
|
||
|
||
process = subprocess.run(
|
||
cmd,
|
||
input=prompt,
|
||
env=_get_gemini_env(model_name),
|
||
text=True,
|
||
capture_output=True,
|
||
timeout=timeout,
|
||
)
|
||
|
||
if process.returncode != 0:
|
||
stderr = (process.stderr or '').strip()
|
||
stdout = (process.stdout or '').strip()
|
||
message = stderr or stdout or 'sin salida'
|
||
raise RuntimeError(f"Gemini CLI failed (rc={process.returncode}): {message}")
|
||
|
||
output = (process.stdout or '').strip()
|
||
if not output:
|
||
raise RuntimeError("Gemini CLI returned empty output")
|
||
return output
|
||
|
||
# --- AUDIO PROCESSING FUNCTIONS ---
|
||
def transcribe_audio(audio_path, output_path):
|
||
"""Transcribe audio usando Whisper con configuración optimizada para español"""
|
||
global _whisper_model
|
||
|
||
# Check and free VRAM if models are idle
|
||
_check_and_free_vram()
|
||
|
||
# Load Whisper model if not already loaded
|
||
if _whisper_model is None:
|
||
try:
|
||
logging.info("Loading Whisper model (medium) for Spanish transcription...")
|
||
# Liberar memoria CUDA primero
|
||
torch.cuda.empty_cache()
|
||
# Configurar entorno para mejor manejo de CUDA
|
||
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
|
||
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
||
_whisper_model = whisper.load_model("medium", device="cuda")
|
||
logging.info("✅ Whisper model loaded successfully on GPU")
|
||
except RuntimeError as e:
|
||
if "CUDA" in str(e) or "GPU" in str(e):
|
||
error_msg = f"❌ Error cargando Whisper en GPU: {e}"
|
||
logging.error(error_msg)
|
||
send_telegram_message(error_msg)
|
||
# Liberar memoria y reintentar
|
||
torch.cuda.empty_cache()
|
||
time.sleep(2)
|
||
_whisper_model = whisper.load_model("medium", device="cuda")
|
||
logging.info("✅ Whisper model loaded successfully on GPU (retry)")
|
||
else:
|
||
raise
|
||
|
||
# Update usage timestamp
|
||
_update_models_usage()
|
||
|
||
logging.info("Starting audio transcription with Spanish optimization...")
|
||
try:
|
||
# Configuración más rápida para español
|
||
result = _whisper_model.transcribe(
|
||
audio_path,
|
||
language="es", # Forzar español
|
||
task="transcribe",
|
||
temperature=0.0, # Menos aleatoriedad
|
||
beam_size=1, # Más rápido
|
||
condition_on_previous_text=False, # Evitar bucles
|
||
fp16=True, # Más rápido
|
||
verbose=False
|
||
)
|
||
except RuntimeError as e:
|
||
if "CUDA" in str(e):
|
||
error_msg = f"❌ CUDA error durante transcripción: {e}"
|
||
logging.error(error_msg)
|
||
send_telegram_message(error_msg)
|
||
# Reintentar con GPU con configuración más ligera
|
||
try:
|
||
logging.info("🔄 Reintentando transcripción con GPU (config ligera)...")
|
||
if _whisper_model is not None:
|
||
del _whisper_model
|
||
torch.cuda.empty_cache()
|
||
time.sleep(2)
|
||
_whisper_model = whisper.load_model("base", device="cuda")
|
||
result = _whisper_model.transcribe(
|
||
audio_path,
|
||
language="es",
|
||
task="transcribe",
|
||
temperature=0.0,
|
||
best_of=3,
|
||
beam_size=3,
|
||
patience=1.0,
|
||
initial_prompt="Este es un audio en español. Hablará claramente y de forma fluida.",
|
||
condition_on_previous_text=True,
|
||
verbose=True
|
||
)
|
||
logging.info("✅ Transcripción completada con GPU (modelo base)")
|
||
except Exception as gpu_error:
|
||
logging.error(f"❌ Error crítico en transcripción con GPU: {gpu_error}")
|
||
raise RuntimeError(f"❌ Error crítico en transcripción: {gpu_error}")
|
||
else:
|
||
raise
|
||
|
||
# Actualizar timestamp durante el procesamiento
|
||
_update_models_usage()
|
||
|
||
# Post-procesamiento para mejorar español
|
||
with open(output_path, "w", encoding="utf-8") as f:
|
||
for seg in result["segments"]:
|
||
start = int(seg["start"])
|
||
hours = start // 3600
|
||
minutes = (start % 3600) // 60
|
||
seconds = start % 60
|
||
timestamp = f"[{hours:02}:{minutes:02}:{seconds:02}]"
|
||
|
||
# Limpiar y normalizar texto
|
||
text = seg['text'].strip()
|
||
|
||
# Correcciones comunes para español (gallego a español)
|
||
text = text.replace("xeo", "yo")
|
||
text = text.replace("non", "no")
|
||
text = text.replace("hai", "hay")
|
||
text = text.replace("entóns", "entonces")
|
||
text = text.replace("máis", "más")
|
||
text = text.replace("tamén", "también")
|
||
text = text.replace("sempre", "siempre")
|
||
text = text.replace("verdade", "verdad")
|
||
text = text.replace("cousa", "cosa")
|
||
text = text.replace("xente", "gente")
|
||
text = text.replace("tempo", "tiempo")
|
||
text = text.replace("lingua", "lengua")
|
||
text = text.replace("pode", "puede")
|
||
text = text.replace("xamón", "shogun")
|
||
text = text.replace("xomón", "shogun")
|
||
text = text.replace("unha", "una")
|
||
text = text.replace("dunha", "de una")
|
||
text = text.replace("nunha", "en una")
|
||
text = text.replace("xeral", "general")
|
||
text = text.replace("xeraria", "jerarquía")
|
||
text = text.replace("ximéas", "temas")
|
||
text = text.replace("ximeas", "temas")
|
||
text = text.replace("ronquera", "reunión")
|
||
text = text.replace("xocalizar", "juntar")
|
||
text = text.replace("oanxacular", "juntar")
|
||
text = text.replace("xocal", "junto")
|
||
text = text.replace("lúmulo", "grupo")
|
||
text = text.replace("lúmido", "grupo")
|
||
text = text.replace("lúmada", "grupos")
|
||
text = text.replace("nulunxación", "reunificación")
|
||
text = text.replace("xotalipa", "capitalista")
|
||
text = text.replace("crente", "gente")
|
||
text = text.replace("enxucar", "juntar")
|
||
|
||
# Normalizar puntuación y espacios
|
||
text = re.sub(r'\s+', ' ', text)
|
||
text = text.strip()
|
||
|
||
f.write(f"{timestamp} {text}\n")
|
||
|
||
# Actualizar timestamp al finalizar
|
||
_update_models_usage()
|
||
logging.info(f"Transcription saved to {output_path}")
|
||
|
||
def run_gemini(prompt, use_flash=True):
|
||
"""Genera contenido usando Claude (GLM-4.6) con fallback a la CLI y API de Gemini."""
|
||
claude_error = None
|
||
gemini_cli_error = None
|
||
|
||
if CLAUDE_CLI_PATH or ZAI_AUTH_TOKEN_FALLBACK:
|
||
try:
|
||
return run_claude_cli(prompt, timeout=300)
|
||
except FileNotFoundError as exc:
|
||
claude_error = exc
|
||
logging.warning("Claude CLI no disponible, utilizando Gemini como fallback.")
|
||
except Exception as exc:
|
||
claude_error = exc
|
||
logging.error(f"Claude CLI error: {exc}")
|
||
|
||
if GEMINI_CLI_PATH:
|
||
try:
|
||
result = _call_gemini_cli(prompt, use_yolo=True)
|
||
if claude_error:
|
||
logging.info("Gemini CLI respondió correctamente tras fallo de Claude CLI.")
|
||
return result
|
||
except FileNotFoundError as exc:
|
||
gemini_cli_error = exc
|
||
logging.warning("Gemini CLI no disponible en el sistema.")
|
||
except Exception as exc:
|
||
gemini_cli_error = exc
|
||
logging.error(f"Gemini CLI error: {exc}")
|
||
|
||
if GEMINI_API_KEY:
|
||
try:
|
||
result = _call_gemini_api(prompt, use_flash=use_flash)
|
||
if claude_error or gemini_cli_error:
|
||
logging.info("Gemini API respondió correctamente tras fallos previos.")
|
||
return result
|
||
except Exception as gemini_exc:
|
||
logging.error(f"Gemini API error: {gemini_exc}")
|
||
errors = []
|
||
if claude_error:
|
||
errors.append(f"Claude CLI: {claude_error}")
|
||
if gemini_cli_error:
|
||
errors.append(f"Gemini CLI: {gemini_cli_error}")
|
||
if errors:
|
||
errors.append(f"Gemini API: {gemini_exc}")
|
||
return " ; ".join(f"Error {e}" for e in errors)
|
||
return f"Error Gemini API: {gemini_exc}"
|
||
|
||
if claude_error:
|
||
base_error = f"Error Claude CLI: {claude_error}"
|
||
if gemini_cli_error:
|
||
return f"{base_error}; Error Gemini CLI: {gemini_cli_error}"
|
||
return base_error
|
||
|
||
if gemini_cli_error:
|
||
return f"Error Gemini CLI: {gemini_cli_error}"
|
||
|
||
return "Error: No hay servicios de resumen disponibles (Claude/Gemini)."
|
||
|
||
def run_gemini_api_fallback(prompt, use_flash=True):
|
||
"""Compatibilidad: delega en la misma llamada local."""
|
||
return run_gemini(prompt, use_flash=use_flash)
|
||
|
||
def run_gemini_summary(prompt):
|
||
"""Genera resumen usando GLM-4.6 (compatibilidad)."""
|
||
return run_gemini(prompt, use_flash=True)
|
||
|
||
def run_ollama(prompt):
|
||
"""Genera contenido usando Ollama"""
|
||
payload = {
|
||
"model": OLLAMA_MODEL,
|
||
"messages": [{"role": "user", "content": prompt}],
|
||
"stream": False
|
||
}
|
||
try:
|
||
r = requests.post(f"{OLLAMA_HOST}/api/chat", json=payload, timeout=120)
|
||
r.raise_for_status()
|
||
response = r.json()
|
||
return response['message']['content']
|
||
except Exception as e:
|
||
return f"Error Ollama: {e}"
|
||
|
||
# --- CLASIFICACIÓN INTELIGENTE DE CONTENIDO ---
|
||
def classify_content_intelligent(text_content):
|
||
"""Clasifica el contenido del resumen en categorías temáticas usando IA"""
|
||
classification_prompt = f"""
|
||
Analiza el siguiente contenido y clasifícalo en UNA de estas 4 categorías:
|
||
|
||
1. HISTORIA - Contenido sobre eventos históricos, cronologías, guerras, revoluciones, personajes históricos, civilizaciones antiguas, historia política, social o económica.
|
||
|
||
2. ANALISIS CONTABLE - Contenido sobre contabilidad, finanzas, balances, estados financieros, costos, presupuestos, auditorías, impuestos, análisis de inversiones, contabilidad de costos.
|
||
|
||
3. INSTITUCIONES DEL GOBIERNO - Contenido sobre gobierno, política, ideologías políticas, instituciones estatales, administración pública, leyes, reglamentos, políticas públicas, estructura gubernamental.
|
||
|
||
4. OTRAS CLASES - Contenido que no encaja en las categorías anteriores: ciencias, tecnología, literatura, arte, filosofía, educación, medicina, derecho, etc.
|
||
|
||
Instrucciones:
|
||
- Responde ÚNICAMENTE con el nombre de la categoría (HISTORIA, ANALISIS CONTABLE, INSTITUCIONES DEL GOBIERNO, OTRAS CLASES)
|
||
- No incluyas explicaciones ni texto adicional
|
||
- Basa tu decisión en el contenido general del texto
|
||
|
||
Contenido a clasificar:
|
||
{text_content}
|
||
"""
|
||
|
||
try:
|
||
# Usar GLM-4.6 para la clasificación
|
||
classification = run_gemini_summary(classification_prompt)
|
||
|
||
# Limpiar y normalizar la respuesta
|
||
classification = classification.strip().upper()
|
||
|
||
# Mapear las respuestas a las claves del diccionario
|
||
category_mapping = {
|
||
"HISTORIA": "historia",
|
||
"ANALISIS CONTABLE": "analisis_contable",
|
||
"ANALISIS CONTABLE": "analisis_contable",
|
||
"INSTITUCIONES DEL GOBIERNO": "instituciones_gobierno",
|
||
"INSTITUCIONES DE GOBIERNO": "instituciones_gobierno",
|
||
"GOBIERNO": "instituciones_gobierno",
|
||
"POLITICA": "instituciones_gobierno",
|
||
"POLÍTICA": "instituciones_gobierno",
|
||
"OTRAS CLASES": "otras_clases",
|
||
"OTRAS": "otras_clases"
|
||
}
|
||
|
||
# Buscar coincidencia exacta primero
|
||
if classification in category_mapping:
|
||
return category_mapping[classification]
|
||
|
||
# Si no hay coincidencia exacta, buscar por palabras clave
|
||
for key, value in category_mapping.items():
|
||
if key in classification:
|
||
return value
|
||
|
||
# Si no se puede clasificar, usar categoría por defecto
|
||
logging.warning(f"⚠️ No se pudo clasificar el contenido: '{classification}', usando categoría por defecto")
|
||
return "otras_clases"
|
||
|
||
except Exception as e:
|
||
logging.error(f"❌ Error en clasificación inteligente: {e}")
|
||
return "otras_clases" # Categoría por defecto en caso de error
|
||
|
||
def ensure_thematic_folders_exist():
|
||
"""Asegura que las carpetas temáticas existan en Nextcloud"""
|
||
for folder_key, folder_name in TEMATIC_FOLDERS.items():
|
||
try:
|
||
webdav_mkdir(folder_name)
|
||
logging.info(f"📁 Verificada/creada carpeta: {folder_name}")
|
||
except Exception as e:
|
||
logging.error(f"❌ Error creando carpeta {folder_name}: {e}")
|
||
|
||
def get_upload_path_for_category(category_key, filename):
|
||
"""Retorna la ruta de subida según la categoría"""
|
||
if category_key in TEMATIC_FOLDERS:
|
||
folder_name = TEMATIC_FOLDERS[category_key]
|
||
return os.path.join(folder_name, filename)
|
||
else:
|
||
# Por defecto usar Otras Clases
|
||
return os.path.join(TEMATIC_FOLDERS["otras_clases"], filename)
|
||
|
||
# --- EXTRACCIÓN DE TEMAS Y RENOMBRADO AUTOMÁTICO ---
|
||
def extract_key_topics_from_text(text):
|
||
"""Extrae temas principales del texto usando IA"""
|
||
if not text or len(text) < 100:
|
||
return ["Temas principales"]
|
||
|
||
topics_prompt = f"""
|
||
Analiza el siguiente texto y extrae los 2-3 temas principales más importantes.
|
||
Responde ÚNICAMENTE con los temas separados por comas, sin explicaciones.
|
||
Usa máximo 3 palabras por tema.
|
||
|
||
Ejemplos de respuesta correcta:
|
||
"Revolución Francesa, Ilustración, Monarquía"
|
||
"Contabilidad financiera, Estados contables, Análisis de ratios"
|
||
"Gobierno democrático, Separación de poderes, Constitución"
|
||
|
||
Texto a analizar:
|
||
{text[:2000]} # Limitar texto para no exceder tokens
|
||
"""
|
||
|
||
try:
|
||
topics_response = run_gemini_summary(topics_prompt)
|
||
|
||
# Limpiar y procesar la respuesta
|
||
topics = []
|
||
for topic in topics_response.split(','):
|
||
topic = topic.strip().title()
|
||
if topic and len(topic) > 2:
|
||
# Limpiar caracteres no deseados
|
||
topic = re.sub(r'[^\w\sáéíóúüñÁÉÍÓÚÜÑ-]', '', topic)
|
||
if topic:
|
||
topics.append(topic)
|
||
|
||
# Asegurar al menos 2 temas
|
||
if len(topics) == 1 and len(topics[0]) > 20:
|
||
# Si el tema es muy largo, dividirlo
|
||
words = topics[0].split()
|
||
if len(words) >= 4:
|
||
topics = [words[0] + " " + words[1], words[2] + " " + words[3]]
|
||
elif len(topics) < 2:
|
||
topics.append("Temas principales")
|
||
|
||
# Limitar a 2-3 temas
|
||
topics = topics[:3]
|
||
|
||
return topics
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error extrayendo temas: {e}")
|
||
return ["Temas principales", "Contenido académico"]
|
||
|
||
def clean_filename_for_topics(name: str, max_length: Optional[int] = None) -> str:
|
||
"""Normaliza un nombre de archivo, preservando la extensión."""
|
||
if not name:
|
||
return "archivo"
|
||
|
||
sanitized = re.sub(r'[<>:"/\\|?*]+', '', name)
|
||
sanitized = re.sub(r'\s+', ' ', sanitized).strip()
|
||
if not sanitized:
|
||
return "archivo"
|
||
|
||
limit = max_length or MAX_FILENAME_LENGTH
|
||
if limit <= 0:
|
||
return sanitized
|
||
|
||
if len(sanitized) <= limit:
|
||
return sanitized
|
||
|
||
stem, ext = os.path.splitext(sanitized)
|
||
if not ext:
|
||
truncated = sanitized[:limit].rstrip(' .-_')
|
||
return truncated or "archivo"
|
||
|
||
available = max(1, limit - len(ext))
|
||
truncated_stem = stem[:available].rstrip(' .-_')
|
||
if not truncated_stem:
|
||
truncated_stem = "archivo"
|
||
|
||
candidate = f"{truncated_stem}{ext}"
|
||
if len(candidate) <= limit:
|
||
return candidate
|
||
|
||
# Ajuste final si la extensión por sí sola excede el límite
|
||
if len(ext) >= limit:
|
||
return ext[-limit:]
|
||
|
||
final_stem = truncated_stem[: limit - len(ext)].rstrip(' .-_') or "archivo"
|
||
return f"{final_stem}{ext}"
|
||
|
||
|
||
def ensure_unique_local_filename(directory: Path, filename: str) -> str:
|
||
"""Garantiza que el nombre no colisione en el directorio indicado."""
|
||
candidate = clean_filename_for_topics(filename, MAX_FILENAME_LENGTH)
|
||
path = directory / candidate
|
||
if not path.exists():
|
||
return candidate
|
||
|
||
stem, ext = os.path.splitext(candidate)
|
||
counter = 1
|
||
while True:
|
||
suffix = f"-{counter}"
|
||
new_name = f"{stem}{suffix}{ext}"
|
||
new_name = clean_filename_for_topics(new_name, MAX_FILENAME_LENGTH)
|
||
if not (directory / new_name).exists():
|
||
return new_name
|
||
counter += 1
|
||
|
||
|
||
def _append_markdown_to_doc(doc: Document, markdown_text: str) -> None:
|
||
lines = markdown_text.splitlines()
|
||
current_paragraph = []
|
||
|
||
for raw_line in lines:
|
||
line = raw_line.rstrip()
|
||
|
||
if not line.strip():
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
current_paragraph = []
|
||
continue
|
||
|
||
stripped = line.lstrip()
|
||
|
||
if stripped.startswith('#'):
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
current_paragraph = []
|
||
level = len(stripped) - len(stripped.lstrip('#'))
|
||
heading_text = stripped.lstrip('#').strip()
|
||
if heading_text:
|
||
doc.add_heading(heading_text, level=max(1, min(6, level)))
|
||
continue
|
||
|
||
if stripped.startswith(('-', '*', '•')):
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
current_paragraph = []
|
||
bullet_text = stripped.lstrip('-*• ').strip()
|
||
if bullet_text:
|
||
doc.add_paragraph(bullet_text, style='List Bullet')
|
||
continue
|
||
|
||
current_paragraph.append(line.strip())
|
||
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
|
||
|
||
def markdown_to_docx(markdown_text: str, output_path: Path, quiz_source: Optional[str] = None) -> None:
|
||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||
doc = Document()
|
||
doc.add_heading('Resumen generado con GLM-4.6', level=1)
|
||
doc.add_paragraph('Este documento fue sintetizado automáticamente usando GLM-4.6 a través de la CLI de Claude (z.ai).')
|
||
doc.add_page_break()
|
||
|
||
_append_markdown_to_doc(doc, markdown_text)
|
||
|
||
quiz_input = quiz_source or markdown_text
|
||
if quiz_input:
|
||
logging.info("🎯 Generando quiz con GLM-4.6...")
|
||
try:
|
||
questions, answers = generate_quiz(quiz_input)
|
||
if questions and answers:
|
||
add_quiz_to_docx(doc, questions, answers)
|
||
logging.info("✅ Quiz agregado al documento")
|
||
except Exception as quiz_error:
|
||
logging.error(f"❌ Error generando quiz: {quiz_error}")
|
||
|
||
doc.save(str(output_path))
|
||
|
||
|
||
def markdown_to_pdf(markdown_text: str, pdf_path: Path, title: Optional[str] = None) -> None:
|
||
pdf_path.parent.mkdir(parents=True, exist_ok=True)
|
||
canvas_obj = canvas.Canvas(str(pdf_path), pagesize=letter)
|
||
width, height = letter
|
||
margin = 72
|
||
y_position = height - margin
|
||
|
||
def new_page():
|
||
nonlocal y_position
|
||
canvas_obj.showPage()
|
||
canvas_obj.setFont('Helvetica', 11)
|
||
y_position = height - margin
|
||
|
||
canvas_obj.setFont('Helvetica', 11)
|
||
|
||
if title:
|
||
canvas_obj.setFont('Helvetica-Bold', 16)
|
||
canvas_obj.drawString(margin, y_position, title[:100])
|
||
y_position -= 28
|
||
canvas_obj.setFont('Helvetica', 11)
|
||
|
||
for raw_line in markdown_text.splitlines():
|
||
line = raw_line.rstrip()
|
||
|
||
if not line.strip():
|
||
y_position -= 14
|
||
if y_position < margin:
|
||
new_page()
|
||
continue
|
||
|
||
stripped = line.lstrip()
|
||
|
||
if stripped.startswith('#'):
|
||
level = len(stripped) - len(stripped.lstrip('#'))
|
||
heading_text = stripped.lstrip('#').strip()
|
||
if heading_text:
|
||
font_size = 16 if level == 1 else 14 if level == 2 else 12
|
||
canvas_obj.setFont('Helvetica-Bold', font_size)
|
||
canvas_obj.drawString(margin, y_position, heading_text[:120])
|
||
y_position -= font_size + 6
|
||
if y_position < margin:
|
||
new_page()
|
||
canvas_obj.setFont('Helvetica', 11)
|
||
continue
|
||
|
||
if stripped.startswith(('-', '*', '•')):
|
||
bullet_text = stripped.lstrip('-*•').strip()
|
||
wrapped_lines = textwrap.wrap(bullet_text, width=80) or ['']
|
||
for idx, wrapped in enumerate(wrapped_lines):
|
||
prefix = '• ' if idx == 0 else ' '
|
||
canvas_obj.drawString(margin, y_position, f"{prefix}{wrapped}")
|
||
y_position -= 14
|
||
if y_position < margin:
|
||
new_page()
|
||
continue
|
||
|
||
wrapped_lines = textwrap.wrap(stripped, width=90) or ['']
|
||
for wrapped in wrapped_lines:
|
||
canvas_obj.drawString(margin, y_position, wrapped)
|
||
y_position -= 14
|
||
if y_position < margin:
|
||
new_page()
|
||
|
||
canvas_obj.save()
|
||
|
||
def generate_intelligent_filename(base_name, summary_content):
|
||
"""Genera nombre de archivo inteligente con temas extraídos"""
|
||
try:
|
||
# Extraer temas principales
|
||
topics = extract_key_topics_from_text(summary_content)
|
||
topics_str = ' - '.join(topics)
|
||
|
||
# Limpiar el nombre base original con una longitud razonable
|
||
clean_base = clean_filename_for_topics(
|
||
base_name.replace('_unificado', ''),
|
||
MAX_FILENAME_BASE_LENGTH,
|
||
)
|
||
if clean_base.lower() == "archivo":
|
||
clean_base = "Resumen"
|
||
|
||
clean_topics = ''
|
||
if topics_str:
|
||
clean_topics = clean_filename_for_topics(topics_str, MAX_FILENAME_TOPICS_LENGTH)
|
||
if clean_topics.lower() == "archivo":
|
||
clean_topics = ''
|
||
|
||
parts = [clean_base]
|
||
if clean_topics:
|
||
parts.append(clean_topics)
|
||
|
||
candidate = ' - '.join(parts) + '_unificado.docx'
|
||
intelligent_name = clean_filename_for_topics(candidate, MAX_FILENAME_LENGTH)
|
||
|
||
logging.info(f"🎯 Temas extraídos: {topics_str}")
|
||
return intelligent_name
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error generando nombre inteligente: {e}")
|
||
# Retornar nombre por defecto si falla
|
||
return f"{base_name}_unificado.docx"
|
||
|
||
# --- QUIZ GENERATION FUNCTIONS ---
|
||
def generate_quiz(summary_text):
|
||
"""Genera un quiz de 10 preguntas basado en el resumen"""
|
||
prompt = f"""
|
||
Basándote en el siguiente resumen, genera exactamente 10 preguntas de opción múltiple en español.
|
||
Cada pregunta debe tener 4 opciones (A, B, C, D) y solo una respuesta correcta.
|
||
Las preguntas deben cubrir los puntos más importantes del resumen.
|
||
|
||
Formato requerido:
|
||
PREGUNTA 1: [texto de la pregunta]
|
||
A) [opción A]
|
||
B) [opción B]
|
||
C) [opción C]
|
||
D) [opción D]
|
||
RESPUESTA: [letra correcta]
|
||
|
||
PREGUNTA 2: [texto de la pregunta]
|
||
A) [opción A]
|
||
B) [opción B]
|
||
C) [opción C]
|
||
D) [opción D]
|
||
RESPUESTA: [letra correcta]
|
||
|
||
[continúa hasta la pregunta 10]
|
||
|
||
Resumen:
|
||
{summary_text}
|
||
"""
|
||
|
||
logging.info("🎯 Generating quiz with GLM-4.6...")
|
||
response = run_gemini(prompt)
|
||
|
||
if "Error" in response:
|
||
logging.error(f"❌ Error generating quiz: {response}")
|
||
return None, None
|
||
|
||
# Parse response to separate questions and answers
|
||
questions = []
|
||
answers = []
|
||
|
||
lines = response.strip().split('\n')
|
||
current_question = None
|
||
current_options = []
|
||
|
||
for line in lines:
|
||
line = line.strip()
|
||
if line.startswith('PREGUNTA'):
|
||
if current_question:
|
||
questions.append(f"{current_question}\n" + "\n".join(current_options))
|
||
current_options = []
|
||
current_question = line
|
||
elif line.startswith(('A)', 'B)', 'C)', 'D)')):
|
||
current_options.append(line)
|
||
elif line.startswith('RESPUESTA:'):
|
||
answer = line.replace('RESPUESTA:', '').strip()
|
||
answers.append(answer)
|
||
|
||
# Add the last question
|
||
if current_question:
|
||
questions.append(f"{current_question}\n" + "\n".join(current_options))
|
||
|
||
return questions, answers
|
||
|
||
def add_quiz_to_docx(doc, questions, answers):
|
||
"""Agrega el quiz al documento DOCX"""
|
||
doc.add_page_break()
|
||
doc.add_heading('Quiz de Evaluación', level=1)
|
||
doc.add_paragraph('Responde las siguientes preguntas basándote en el resumen anterior.')
|
||
doc.add_paragraph('')
|
||
|
||
# Add questions
|
||
for i, question in enumerate(questions, 1):
|
||
doc.add_paragraph(question)
|
||
doc.add_paragraph('')
|
||
|
||
# Add answers
|
||
doc.add_page_break()
|
||
doc.add_heading('Respuestas del Quiz', level=1)
|
||
|
||
for i, answer in enumerate(answers, 1):
|
||
doc.add_paragraph(f"Pregunta {i}: {answer}")
|
||
|
||
# --- DOCUMENT GENERATION FUNCTIONS ---
|
||
def save_summary_docx(content, model_name, filename, text_for_quiz=None):
|
||
"""Guarda el resumen en formato DOCX con formato mejorado (legacy function)"""
|
||
doc = Document()
|
||
doc.add_heading('Resumen generado', level=1)
|
||
|
||
# Procesar contenido
|
||
lines = content.splitlines()
|
||
current_paragraph = []
|
||
|
||
for line in lines:
|
||
line = line.strip()
|
||
if not line:
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
current_paragraph = []
|
||
continue
|
||
|
||
if line.startswith('#'):
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
current_paragraph = []
|
||
# Procesar encabezado
|
||
level = len(line) - len(line.lstrip('#'))
|
||
if level <= 6:
|
||
doc.add_heading(line.lstrip('#').strip(), level=level)
|
||
else:
|
||
current_paragraph.append(line)
|
||
elif line.startswith('-') or line.startswith('•'):
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
current_paragraph = []
|
||
doc.add_paragraph(line.lstrip('-•').strip(), style='List Bullet')
|
||
else:
|
||
current_paragraph.append(line)
|
||
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
|
||
# Add quiz if text is provided
|
||
if text_for_quiz:
|
||
logging.info("🎯 Generating quiz...")
|
||
try:
|
||
quiz_text = text_for_quiz if text_for_quiz else content
|
||
questions, answers = generate_quiz(quiz_text)
|
||
if questions and answers:
|
||
add_quiz_to_docx(doc, questions, answers)
|
||
logging.info("✅ Quiz added to document")
|
||
except Exception as e:
|
||
logging.error(f"❌ Error generating quiz: {e}")
|
||
|
||
doc.save(filename)
|
||
|
||
def run_claude_summary_pipeline(text):
|
||
"""Genera bullet points, resumen integrado y formato final usando Claude CLI con chunks."""
|
||
|
||
# Validar que el texto tenga contenido suficiente
|
||
if not text or len(text.strip()) < 50:
|
||
logging.warning("⚠️ Texto demasiado corto para generar resumen, usando contenido por defecto")
|
||
text = "Contenido educativo procesado. Se generó un documento editable a partir de un archivo PDF."
|
||
|
||
# Dividir texto en partes si es muy largo
|
||
max_chunk_size = 6000 # Caracteres por chunk (más grande para Claude)
|
||
if len(text) > max_chunk_size:
|
||
logging.info(f"📝 Dividiendo texto de {len(text)} caracteres en chunks de {max_chunk_size}")
|
||
text_chunks = []
|
||
|
||
# Dividir por párrafos para mantener coherencia
|
||
paragraphs = text.split('\n\n')
|
||
current_chunk = ""
|
||
|
||
for paragraph in paragraphs:
|
||
if len(current_chunk + paragraph) <= max_chunk_size:
|
||
current_chunk += paragraph + "\n\n"
|
||
else:
|
||
if current_chunk.strip():
|
||
text_chunks.append(current_chunk.strip())
|
||
current_chunk = paragraph + "\n\n"
|
||
|
||
if current_chunk.strip():
|
||
text_chunks.append(current_chunk.strip())
|
||
|
||
logging.info(f"📝 Texto dividido en {len(text_chunks)} partes")
|
||
else:
|
||
text_chunks = [text]
|
||
|
||
logging.info("🔹 Claude CLI generando bullet points por partes...")
|
||
|
||
# Generar bullet points para cada chunk usando Claude CLI
|
||
all_bullets = []
|
||
for i, chunk in enumerate(text_chunks):
|
||
logging.info(f"🔹 Procesando chunk {i+1}/{len(text_chunks)} con Claude CLI...")
|
||
|
||
bullet_prompt = f"""Analiza el siguiente texto y extrae entre 5 y 8 bullet points clave en español.
|
||
|
||
REGLAS ESTRICTAS:
|
||
1. Devuelve ÚNICAMENTE bullet points, cada línea iniciando con "- "
|
||
2. Cada bullet debe ser conciso (12-20 palabras) y resaltar datos, fechas, conceptos o conclusiones importantes
|
||
3. NO agregues introducciones, conclusiones ni texto explicativo
|
||
4. Concéntrate en los puntos más importantes del texto
|
||
5. Incluye fechas, datos específicos y nombres relevantes si los hay
|
||
|
||
Texto (parte {i+1} de {len(text_chunks)}):
|
||
{chunk}"""
|
||
|
||
try:
|
||
chunk_bullets = run_claude_cli(bullet_prompt, timeout=300)
|
||
logging.info(f"✅ Claude CLI responded successfully for chunk {i+1}")
|
||
|
||
except subprocess.TimeoutExpired:
|
||
logging.warning(f"⚠️ Claude CLI timeout for chunk {i+1}, usando fallback")
|
||
chunk_bullets = f"- Punto principal de la sección {i+1}\n- Concepto secundario importante\n- Información relevante extraída\n- Datos significativos del texto\n- Conclusiones clave"
|
||
except Exception as e:
|
||
logging.warning(f"⚠️ Claude CLI error for chunk {i+1}: {e}")
|
||
chunk_bullets = f"- Punto principal de la sección {i+1}\n- Concepto secundario importante\n- Información relevante extraída\n- Datos significativos del texto\n- Conclusiones clave"
|
||
|
||
# Procesar bullets del chunk
|
||
for line in chunk_bullets.split('\n'):
|
||
line = line.strip()
|
||
if line.startswith('-') or line.startswith('•'):
|
||
bullet = '- ' + line.lstrip('-• ').strip()
|
||
if len(bullet) > 10: # Ignorar bullets muy cortos
|
||
all_bullets.append(bullet)
|
||
|
||
bullet_count = len([b for b in chunk_bullets.split('\n') if b.strip()])
|
||
logging.info(f"✅ Chunk {i+1} procesado: {bullet_count} bullets")
|
||
|
||
# Limitar bullets totales y eliminar duplicados
|
||
unique_bullets = []
|
||
seen = set()
|
||
for bullet in all_bullets[:15]: # Máximo 15 bullets
|
||
bullet_clean = bullet.lower().strip()
|
||
if bullet_clean not in seen and len(bullet_clean) > 15:
|
||
unique_bullets.append(bullet)
|
||
seen.add(bullet_clean)
|
||
|
||
claude_bullets = "\n".join(unique_bullets)
|
||
logging.info(f"✅ Total de {len(unique_bullets)} bullets únicos generados con Claude CLI")
|
||
|
||
logging.info("🔸 Claude CLI generando resumen integrado...")
|
||
|
||
# Para el resumen, usar una versión condensada del texto si es muy largo
|
||
if len(text) > 6000:
|
||
summary_text = text[:6000] + "\n\n[El documento continúa con contenido adicional...]"
|
||
logging.info("📝 Usando versión condensada del texto para el resumen")
|
||
else:
|
||
summary_text = text
|
||
|
||
summary_prompt = f"""Eres un profesor universitario experto en historia del siglo XX. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos.
|
||
|
||
REQUISITOS ESTRICTOS:
|
||
- Extensión entre 500-700 palabras
|
||
- Usa encabezados Markdown con jerarquía clara (##, ###)
|
||
- Desarrolla los puntos clave con profundidad y contexto histórico
|
||
- Mantén un tono académico y analítico
|
||
- Incluye conclusiones significativas
|
||
- NO agregues texto fuera del resumen
|
||
- Devuelve únicamente el resumen en formato Markdown
|
||
|
||
Bullet points extraídos:
|
||
{claude_bullets}
|
||
|
||
Texto original (resumido si es muy extenso):
|
||
{summary_text}
|
||
|
||
Responde únicamente con el resumen en Markdown."""
|
||
|
||
try:
|
||
summary_output = run_claude_cli(summary_prompt, timeout=300)
|
||
logging.info("✅ Resumen integrado generado por Claude CLI")
|
||
except subprocess.TimeoutExpired:
|
||
logging.warning("⚠️ Claude CLI timeout for summary, usando fallback")
|
||
summary_output = f"""# Resumen del Documento
|
||
|
||
## Puntos Principales
|
||
- El documento ha sido procesado exitosamente
|
||
- Se extrajo el contenido textual del PDF original
|
||
- El material está disponible en formato editable
|
||
|
||
## Información Relevante
|
||
El texto procesado contiene información académica sobre el período histórico analizado.
|
||
|
||
## Conclusiones
|
||
El documento está disponible en formato DOCX para su posterior edición y análisis."""
|
||
logging.info("✅ Resumen fallback generado")
|
||
except Exception as e:
|
||
logging.warning(f"⚠️ Claude CLI error for summary: {e}")
|
||
summary_output = f"""# Resumen del Documento
|
||
|
||
## Puntos Principales
|
||
- El documento ha sido procesado exitosamente
|
||
- Se extrajo el contenido textual del PDF original
|
||
- El material está disponible en formato editable
|
||
|
||
## Información Relevante
|
||
El texto procesado contiene información académica sobre el período histórico analizado.
|
||
|
||
## Conclusiones
|
||
El documento está disponible en formato DOCX para su posterior edición y análisis."""
|
||
|
||
logging.info("🔶 Claude CLI aplicando formato final...")
|
||
format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible:
|
||
|
||
{summary_output}
|
||
|
||
Instrucciones:
|
||
- Corrige cualquier error de formato
|
||
- Asegúrate de que los encabezados estén bien espaciados
|
||
- Verifica que las viñetas usen "- " correctamente
|
||
- Mantén exactamente el contenido existente
|
||
- Devuelve únicamente el resumen formateado sin texto adicional"""
|
||
|
||
try:
|
||
formatted_output = run_claude_cli(format_prompt, timeout=180)
|
||
logging.info("✅ Formato final aplicado por Claude CLI")
|
||
except Exception as e:
|
||
logging.warning(f"⚠️ Claude CLI formatting error: {e}")
|
||
formatted_output = summary_output
|
||
|
||
return True, claude_bullets, summary_output, formatted_output
|
||
|
||
# Mantener la función original para compatibilidad
|
||
def run_gemini_summary_pipeline(text):
|
||
"""Compatibilidad: usa GLM-4.6 vía Claude CLI."""
|
||
return run_claude_summary_pipeline(text)
|
||
|
||
|
||
def generate_unified_summary(local_txt_path, base_name):
|
||
"""Genera resumen en flujo TXT → MD → DOCX → PDF usando GLM-4.6."""
|
||
with open(local_txt_path, "r", encoding="utf-8") as f:
|
||
text = f.read()
|
||
|
||
logging.info("🤖 Iniciando síntesis colaborativa con GLM-4.6 (z.ai)...")
|
||
send_telegram_message("Iniciando resumen colaborativo con GLM-4.6 (z.ai)")
|
||
|
||
success, bullet_points, raw_summary, formatted_summary = run_gemini_summary_pipeline(text)
|
||
if not success:
|
||
return False, None, {}
|
||
|
||
summary_content = (formatted_summary or "").strip()
|
||
if not summary_content:
|
||
summary_content = "\n\n".join(filter(None, [bullet_points, raw_summary])).strip()
|
||
if not summary_content:
|
||
summary_content = text.strip()
|
||
|
||
summary_content = summary_content or "Resumen no disponible"
|
||
|
||
intelligent_filename = generate_intelligent_filename(base_name, summary_content)
|
||
intelligent_filename = ensure_unique_local_filename(Path(LOCAL_DOWNLOADS_PATH), intelligent_filename)
|
||
docx_path = Path(LOCAL_DOWNLOADS_PATH) / intelligent_filename
|
||
|
||
markdown_filename = Path(intelligent_filename).with_suffix('.md').name
|
||
markdown_path = docx_path.with_suffix('.md')
|
||
with open(markdown_path, 'w', encoding='utf-8') as markdown_file:
|
||
markdown_file.write(summary_content)
|
||
|
||
logging.info(f"📝 Guardando resumen Markdown en {markdown_path}")
|
||
|
||
markdown_to_docx(summary_content, docx_path, quiz_source=summary_content)
|
||
logging.info(f"✅ Documento DOCX generado: {docx_path}")
|
||
|
||
pdf_path = docx_path.with_suffix('.pdf')
|
||
pdf_created = True
|
||
try:
|
||
markdown_to_pdf(summary_content, pdf_path, title=docx_path.stem)
|
||
logging.info(f"✅ PDF generado: {pdf_path}")
|
||
except Exception as pdf_error:
|
||
pdf_created = False
|
||
logging.error(f"❌ Error generando PDF: {pdf_error}")
|
||
|
||
send_telegram_message(f"✅ Resumen colaborativo GLM-4.6 completado: {intelligent_filename}")
|
||
|
||
output_files = {
|
||
'docx_path': str(docx_path),
|
||
'docx_name': intelligent_filename,
|
||
'markdown_path': str(markdown_path),
|
||
'markdown_name': markdown_filename,
|
||
'pdf_path': str(pdf_path) if pdf_created else None,
|
||
'pdf_name': pdf_path.name if pdf_created else None,
|
||
}
|
||
|
||
return True, summary_content, output_files
|
||
|
||
def generate_summaries_from_text(local_txt_path, base_name):
|
||
"""Generate unified summary using 3 AI models in collaboration"""
|
||
return generate_unified_summary(local_txt_path, base_name)
|
||
|
||
# --- PDF PROCESSING FUNCTIONS ---
|
||
def preprocess_image(img):
|
||
"""Preprocesa la imagen para mejorar la calidad del OCR."""
|
||
try:
|
||
img_np = np.array(img)
|
||
gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
|
||
binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 10)
|
||
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
|
||
contrast = clahe.apply(binary)
|
||
denoised = cv2.fastNlMeansDenoising(contrast, None, 30, 7, 21)
|
||
return denoised
|
||
except Exception as e:
|
||
logging.error(f"Error en preprocesamiento de imagen: {e}")
|
||
return np.array(img)
|
||
|
||
|
||
def normalize_pdf_extracted_text(text):
|
||
"""Normaliza texto extraído directamente de un PDF manteniendo saltos de línea útiles."""
|
||
if not text:
|
||
return ''
|
||
|
||
allowed_controls = {'\n', '\r', '\t'}
|
||
filtered_chars = [
|
||
char for char in text
|
||
if unicodedata.category(char)[0] != 'C' or char in allowed_controls
|
||
]
|
||
cleaned = ''.join(filtered_chars)
|
||
cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')
|
||
cleaned = re.sub(r'[ \t]+', ' ', cleaned)
|
||
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
|
||
return cleaned.strip()
|
||
|
||
|
||
def extract_pdf_text_if_text_based(reader, filename):
|
||
"""Intenta detectar y devolver texto directo si el PDF no es escaneado."""
|
||
total_pages = len(reader.pages)
|
||
if total_pages == 0:
|
||
return None, 0.0, 0.0
|
||
|
||
page_texts = []
|
||
text_pages = 0
|
||
total_chars = 0
|
||
|
||
for index, page in enumerate(reader.pages):
|
||
try:
|
||
raw_text = page.extract_text() or ''
|
||
except Exception as exc:
|
||
logging.debug(
|
||
"Error extrayendo texto de la página %s de %s: %s",
|
||
index + 1,
|
||
filename,
|
||
exc,
|
||
)
|
||
raw_text = ''
|
||
|
||
normalized = normalize_pdf_extracted_text(raw_text)
|
||
if normalized:
|
||
text_pages += 1
|
||
total_chars += len(normalized)
|
||
page_texts.append(normalized)
|
||
|
||
ratio = text_pages / total_pages if total_pages else 0.0
|
||
avg_chars = (total_chars / text_pages) if text_pages else 0.0
|
||
|
||
if text_pages and ratio >= PDF_TEXT_DETECTION_MIN_RATIO and avg_chars >= PDF_TEXT_DETECTION_MIN_AVG_CHARS:
|
||
logging.info(
|
||
"📑 PDF '%s' detectado como basado en texto (ratio=%.0f%%, avg_chars=%.0f).",
|
||
filename,
|
||
ratio * 100,
|
||
avg_chars,
|
||
)
|
||
return page_texts, ratio, avg_chars
|
||
|
||
logging.debug(
|
||
"PDF '%s' requiere OCR (ratio=%.0f%%, avg_chars=%.0f).",
|
||
filename,
|
||
ratio * 100,
|
||
avg_chars,
|
||
)
|
||
return None, ratio, avg_chars
|
||
|
||
|
||
def process_pdf_file(input_pdf_path, output_docx_path):
|
||
"""Main workflow for processing a single PDF file."""
|
||
pdf_filename = os.path.basename(input_pdf_path)
|
||
send_telegram_message(f"⚙️ Iniciando procesamiento de PDF: {pdf_filename}")
|
||
temp_dir = f"temp_pdf_chunks_{pdf_filename}"
|
||
|
||
if not os.path.isfile(input_pdf_path):
|
||
logging.error(f"Input file not found: {input_pdf_path}")
|
||
raise FileNotFoundError(f"Input file not found: {input_pdf_path}")
|
||
|
||
try:
|
||
logging.info(f"Processing: {pdf_filename}")
|
||
reader = PdfReader(input_pdf_path)
|
||
num_pages = len(reader.pages)
|
||
|
||
direct_text_pages, text_ratio, avg_chars = extract_pdf_text_if_text_based(reader, pdf_filename)
|
||
all_corrected_texts = []
|
||
|
||
if direct_text_pages is not None:
|
||
logging.info(
|
||
"Usando extracción directa de texto para '%s' (ratio=%.0f%%, avg_chars=%.0f).",
|
||
pdf_filename,
|
||
text_ratio * 100,
|
||
avg_chars,
|
||
)
|
||
send_telegram_message(f"📑 Texto incrustado detectado, evitando OCR para: {pdf_filename}")
|
||
raw_text_content = f"\n\n{_PAGE_BREAK_TOKEN}\n\n".join(direct_text_pages)
|
||
if raw_text_content.strip():
|
||
# Para PDFs con texto, NO aplicar corrección con GLM - usar texto directo
|
||
all_corrected_texts.append(raw_text_content)
|
||
else:
|
||
logging.info(
|
||
"Realizando OCR completo para '%s' (ratio=%.0f%%, avg_chars=%.0f).",
|
||
pdf_filename,
|
||
text_ratio * 100,
|
||
avg_chars,
|
||
)
|
||
|
||
# Para OCR, dividir en chunks solo si es necesario
|
||
pdf_chunks = []
|
||
if num_pages > MAX_PAGES_PER_CHUNK:
|
||
logging.info(f"PDF requires OCR and has {num_pages} pages. Splitting into chunks of {MAX_PAGES_PER_CHUNK}.")
|
||
os.makedirs(temp_dir, exist_ok=True)
|
||
for i in range(0, num_pages, MAX_PAGES_PER_CHUNK):
|
||
writer = PdfWriter()
|
||
chunk_end = min(i + MAX_PAGES_PER_CHUNK, num_pages)
|
||
for j in range(i, chunk_end):
|
||
writer.add_page(reader.pages[j])
|
||
chunk_path = os.path.join(temp_dir, f"chunk_{i // MAX_PAGES_PER_CHUNK}.pdf")
|
||
with open(chunk_path, "wb") as f:
|
||
writer.write(f)
|
||
pdf_chunks.append(chunk_path)
|
||
send_telegram_message(f"📄 PDF split into {len(pdf_chunks)} parts for OCR processing.")
|
||
else:
|
||
pdf_chunks.append(input_pdf_path)
|
||
|
||
ocr_reader, trocr_models = get_ocr_models()
|
||
|
||
for idx, chunk_path in enumerate(pdf_chunks):
|
||
logging.info(f"--- Processing chunk {idx + 1}/{len(pdf_chunks)} ---")
|
||
send_telegram_message(f"🧠 OCR with GPU processing part {idx + 1}/{len(pdf_chunks)} of {pdf_filename}...")
|
||
|
||
_update_models_usage()
|
||
|
||
images = convert_from_path(chunk_path, dpi=PDF_DPI, thread_count=PDF_RENDER_THREAD_COUNT)
|
||
full_text_raw = []
|
||
|
||
if not images:
|
||
logging.warning(f"No se generaron imágenes para el chunk {idx + 1}")
|
||
continue
|
||
|
||
batch_size = max(1, min(PDF_BATCH_SIZE, len(images)))
|
||
logging.info(
|
||
f"⚙️ Config GPU PDF -> render_threads={PDF_RENDER_THREAD_COUNT}, "
|
||
f"batch_size={batch_size}, trocr_max_batch={PDF_TROCR_MAX_BATCH}"
|
||
)
|
||
|
||
def _tesseract_ocr(img_np):
|
||
return pytesseract.image_to_string(img_np, lang='spa')
|
||
|
||
with ThreadPoolExecutor(max_workers=PDF_PREPROCESS_THREADS) as preprocess_pool, \
|
||
ThreadPoolExecutor(max_workers=PDF_TESSERACT_THREADS) as tess_pool:
|
||
for i in range(0, len(images), batch_size):
|
||
batch_images = images[i:i + batch_size]
|
||
|
||
_update_models_usage()
|
||
|
||
preprocessed_batch = list(preprocess_pool.map(preprocess_image, batch_images))
|
||
|
||
try:
|
||
easy_results = ocr_reader.readtext_batched(
|
||
preprocessed_batch,
|
||
detail=1,
|
||
batch_size=len(preprocessed_batch)
|
||
)
|
||
except AttributeError:
|
||
easy_results = [
|
||
ocr_reader.readtext(img_data, detail=1, batch_size=len(preprocessed_batch))
|
||
for img_data in preprocessed_batch
|
||
]
|
||
except Exception as e:
|
||
logging.error(f"Error en EasyOCR batched: {e}, usando fallback secuencial")
|
||
easy_results = [
|
||
ocr_reader.readtext(img_data, detail=1, batch_size=len(preprocessed_batch))
|
||
for img_data in preprocessed_batch
|
||
]
|
||
|
||
tess_texts = list(tess_pool.map(_tesseract_ocr, preprocessed_batch))
|
||
|
||
if (not isinstance(trocr_models, dict) or
|
||
trocr_models.get('processor') is None or
|
||
trocr_models.get('model') is None):
|
||
logging.info("♻️ TrOCR models were freed, reloading before OCR batch")
|
||
_, trocr_models = get_ocr_models()
|
||
|
||
trocr_texts = trocr_ocr_batch(
|
||
batch_images,
|
||
trocr_models['processor'],
|
||
trocr_models['model'],
|
||
max_batch_size=PDF_TROCR_MAX_BATCH
|
||
)
|
||
|
||
for img_idx, img_preprocessed in enumerate(preprocessed_batch):
|
||
easy_text = ''
|
||
if easy_results and img_idx < len(easy_results):
|
||
easy_text = ' '.join([line[1] for line in easy_results[img_idx]])
|
||
|
||
text_tess = tess_texts[img_idx] if img_idx < len(tess_texts) else ''
|
||
text_trocr = trocr_texts[img_idx] if img_idx < len(trocr_texts) else ''
|
||
|
||
combined_parts = [part for part in (easy_text, text_tess, text_trocr) if part]
|
||
combined_text = '\n'.join(combined_parts)
|
||
full_text_raw.append(clean_text(combined_text))
|
||
|
||
raw_text_content = f"\n\n{_PAGE_BREAK_TOKEN}\n\n".join(full_text_raw)
|
||
if not raw_text_content.strip():
|
||
logging.warning(f"Chunk {idx + 1} no produjo texto significativo tras OCR")
|
||
continue
|
||
corrected_chunk_text = gemini_correct_text(raw_text_content)
|
||
all_corrected_texts.append(corrected_chunk_text)
|
||
|
||
final_text = "\n\n".join(text for text in all_corrected_texts if text)
|
||
if not final_text.strip():
|
||
raise ValueError("No se pudo extraer texto del PDF.")
|
||
|
||
# Para PDFs con texto, no aplicar formateo con GLM
|
||
if direct_text_pages is not None:
|
||
formatted_text = final_text # Usar texto directo sin formato adicional
|
||
else:
|
||
# Solo para OCR, aplicar formateo con GLM
|
||
formatted_text = format_text_with_gemini_for_docx(final_text, pdf_filename)
|
||
|
||
doc = Document()
|
||
doc.add_heading(f"Documento Editable: {pdf_filename}", level=1)
|
||
add_markdown_content_to_document(doc, formatted_text)
|
||
doc.save(output_docx_path)
|
||
# Determinar tipo de procesamiento para el mensaje
|
||
if direct_text_pages is not None:
|
||
send_telegram_message(f"✅ PDF with embedded text processed and saved as DOCX: {os.path.basename(output_docx_path)}")
|
||
else:
|
||
send_telegram_message(f"✅ PDF with OCR processed and saved as DOCX: {os.path.basename(output_docx_path)}")
|
||
|
||
finally:
|
||
if os.path.exists(temp_dir):
|
||
logging.info(f"Cleaning up temporary directory: {temp_dir}")
|
||
shutil.rmtree(temp_dir)
|
||
|
||
def trocr_ocr_batch(pil_images, processor, model, max_batch_size=4):
|
||
"""Ejecuta OCR TrOCR sobre una lista de imágenes con manejo adaptativo GPU/CPU."""
|
||
if not pil_images:
|
||
return []
|
||
|
||
_update_models_usage()
|
||
|
||
def _refresh_trocr(proc, mdl, reason):
|
||
logging.info(f"♻️ TrOCR reload triggered ({reason})")
|
||
_, trocr_bundle = get_ocr_models()
|
||
if not isinstance(trocr_bundle, dict):
|
||
return None, None
|
||
return trocr_bundle.get('processor'), trocr_bundle.get('model')
|
||
|
||
refresh_reason = None
|
||
if processor is None or model is None:
|
||
refresh_reason = "models missing"
|
||
if refresh_reason:
|
||
processor, model = _refresh_trocr(processor, model, refresh_reason)
|
||
|
||
sample_param = None
|
||
attempts = 0
|
||
while attempts < 2:
|
||
if model is None:
|
||
processor, model = _refresh_trocr(processor, model, "model None on attempt")
|
||
if model is None:
|
||
attempts += 1
|
||
continue
|
||
try:
|
||
sample_param = next(model.parameters())
|
||
break
|
||
except (AttributeError, StopIteration):
|
||
logging.warning("TrOCR model parameters unavailable, forcing reload")
|
||
processor, model = _refresh_trocr(processor, model, "no parameters")
|
||
attempts += 1
|
||
|
||
if sample_param is None:
|
||
raise RuntimeError("TrOCR model parameters unavailable after reload attempts")
|
||
|
||
device = sample_param.device
|
||
is_gpu = device.type == 'cuda'
|
||
dtype = sample_param.dtype
|
||
results = []
|
||
|
||
# Reducir batch size en CPU para mayor eficiencia
|
||
if is_gpu:
|
||
batch_size = max(1, min(max_batch_size, len(pil_images)))
|
||
else:
|
||
batch_size = max(1, min(2, len(pil_images))) # Batch size más pequeño para CPU
|
||
|
||
start_idx = 0
|
||
|
||
while start_idx < len(pil_images):
|
||
end_idx = min(start_idx + batch_size, len(pil_images))
|
||
current_batch = pil_images[start_idx:end_idx]
|
||
|
||
try:
|
||
with torch.inference_mode():
|
||
pixel_values = processor(images=current_batch, return_tensors="pt").pixel_values
|
||
pixel_values = pixel_values.to(device)
|
||
if pixel_values.dtype != dtype:
|
||
pixel_values = pixel_values.to(dtype=dtype)
|
||
|
||
generated_ids = model.generate(pixel_values, max_length=512)
|
||
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
||
results.extend(decoded)
|
||
start_idx = end_idx
|
||
|
||
except RuntimeError as e:
|
||
if "out of memory" in str(e).lower() and is_gpu and batch_size > 1:
|
||
logging.warning(f"⚠️ TrOCR OOM con batch_size={batch_size}, reduciendo a {batch_size // 2}")
|
||
torch.cuda.empty_cache()
|
||
batch_size = max(1, batch_size // 2)
|
||
continue
|
||
else:
|
||
logging.error(f"❌ Error en TrOCR batch: {e}")
|
||
results.extend([""] * len(current_batch))
|
||
start_idx = end_idx
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error inesperado en TrOCR batch: {e}")
|
||
results.extend([""] * len(current_batch))
|
||
start_idx = end_idx
|
||
|
||
# Pequeña pausa en CPU para no sobrecargar
|
||
if not is_gpu and start_idx < len(pil_images):
|
||
time.sleep(0.1)
|
||
|
||
return results
|
||
|
||
def clean_text(text):
|
||
"""Limpia y normaliza el texto extraído."""
|
||
text = ''.join(c for c in text if unicodedata.category(c)[0] != 'C')
|
||
text = re.sub(r'\s+', ' ', text)
|
||
text = unicodedata.normalize('NFKC', text)
|
||
return text
|
||
|
||
_PAGE_BREAK_TOKEN = "[[PAGE_BREAK]]"
|
||
_LEGACY_PAGE_BREAK_PATTERN = re.compile(r'-{3,}\s*Nueva Página\s*-{3,}', re.IGNORECASE)
|
||
|
||
|
||
def format_text_with_gemini_for_docx(text, pdf_filename):
|
||
"""Solicita a GLM-4.6 que añada títulos/subtítulos sin alterar el contenido."""
|
||
if not text:
|
||
return text
|
||
|
||
if not GEMINI_AVAILABLE:
|
||
logging.debug("GLM-4.6 no disponible para formateo DOCX, se usa texto sin cambios.")
|
||
return text
|
||
|
||
prompt = (
|
||
"Eres un asistente editorial que trabaja sobre el contenido íntegro de un PDF ya corregido. "
|
||
"Tu tarea es devolver EXACTAMENTE el mismo texto, sin resumir, omitir ni reescribir frases. "
|
||
"Solo puedes insertar títulos y subtítulos descriptivos que ayuden a estructurar el documento.\n\n"
|
||
"Instrucciones estrictas:\n"
|
||
"- Usa formato Markdown simple: `# Título principal` y `## Subtítulo`. No utilices niveles adicionales.\n"
|
||
f"- Mantén el marcador literal {_PAGE_BREAK_TOKEN} cuando aparezca; equivale a un salto de página.\n"
|
||
"- Conserva el orden y la redacción original de todos los párrafos.\n"
|
||
"- No agregues listas, viñetas, comentarios ni explicaciones extra.\n"
|
||
"- Responde únicamente con el contenido formateado. Nada de prefacios ni notas.\n\n"
|
||
f"Nombre del archivo: {pdf_filename}\n\n"
|
||
"Contenido:\n"
|
||
"<<<INICIO>>>\n"
|
||
f"{text}\n"
|
||
"<<<FIN>>>"
|
||
)
|
||
|
||
formatted = run_gemini(prompt, use_flash=True)
|
||
if not formatted or not formatted.strip():
|
||
logging.error("GLM-4.6 devolvió una respuesta vacía para el formato DOCX")
|
||
return text
|
||
|
||
if formatted.lower().startswith("error"):
|
||
logging.error(f"GLM-4.6 no pudo formatear el documento: {formatted}")
|
||
return text
|
||
|
||
return formatted.strip()
|
||
|
||
|
||
def add_markdown_content_to_document(doc, content):
|
||
"""Convierte la salida Markdown generada por GLM-4.6 en párrafos y encabezados DOCX."""
|
||
if not content:
|
||
return
|
||
|
||
normalized = content.replace(_PAGE_BREAK_TOKEN, f"\n{_PAGE_BREAK_TOKEN}\n")
|
||
normalized = _LEGACY_PAGE_BREAK_PATTERN.sub(f"\n{_PAGE_BREAK_TOKEN}\n", normalized)
|
||
|
||
buffer = []
|
||
|
||
def flush_buffer():
|
||
if buffer:
|
||
paragraph_text = ' '.join(line.strip() for line in buffer if line.strip())
|
||
if paragraph_text:
|
||
doc.add_paragraph(paragraph_text)
|
||
buffer.clear()
|
||
|
||
for line in normalized.splitlines():
|
||
stripped = line.strip()
|
||
|
||
if not stripped:
|
||
flush_buffer()
|
||
continue
|
||
|
||
if stripped == _PAGE_BREAK_TOKEN:
|
||
flush_buffer()
|
||
doc.add_page_break()
|
||
continue
|
||
|
||
if stripped.startswith('## '):
|
||
flush_buffer()
|
||
doc.add_heading(stripped[3:].strip(), level=3)
|
||
continue
|
||
|
||
if stripped.startswith('# '):
|
||
flush_buffer()
|
||
doc.add_heading(stripped[2:].strip(), level=2)
|
||
continue
|
||
|
||
buffer.append(line)
|
||
|
||
flush_buffer()
|
||
|
||
|
||
def gemini_correct_text(text):
|
||
"""Usa la API de GLM-4.6 para corregir y reconstruir el texto."""
|
||
if not (GEMINI_CLI_PATH or GEMINI_API_KEY or CLAUDE_CLI_PATH):
|
||
logging.debug("GLM-4.6 no disponible para corrección, se mantiene el texto original.")
|
||
return text
|
||
|
||
prompt = f'''Corrige y reconstruye el siguiente texto extraído por OCR de un documento PDF. El texto puede contener errores, palabras mal escritas, frases incompletas o desordenadas. Tu tarea es devolver únicamente el texto corregido, limpio, coherente y bien estructurado en español. No incluyas explicaciones, preámbulos ni formato adicional. Solo el texto final y legible:
|
||
|
||
--- INICIO DEL TEXTO ---
|
||
{text}
|
||
--- FIN DEL TEXTO ---'''
|
||
|
||
try:
|
||
corrected_text = run_gemini(prompt, use_flash=True)
|
||
if not corrected_text or not corrected_text.strip():
|
||
return text
|
||
|
||
normalized = corrected_text.lstrip()
|
||
if normalized.lower().startswith("error"):
|
||
return text
|
||
|
||
return corrected_text
|
||
except Exception as e:
|
||
logging.error(f"Error en la llamada a la API de GLM-4.6: {e}")
|
||
return text
|
||
|
||
def get_ocr_models():
|
||
"""Carga y cachea los modelos OCR para mejorar rendimiento con sistema de timeout - USA GPU/CPU ADAPTATIVO"""
|
||
global _ocr_models, _trocr_models
|
||
|
||
# Actualizar timestamp de uso
|
||
_update_models_usage()
|
||
|
||
# Múltiples intentos para cargar modelos con reintento
|
||
max_retries = 3
|
||
use_gpu = torch.cuda.is_available()
|
||
|
||
# Verificar memoria disponible si hay GPU
|
||
if use_gpu:
|
||
try:
|
||
total_memory = torch.cuda.get_device_properties(0).total_memory
|
||
allocated_memory = torch.cuda.memory_allocated(0)
|
||
free_memory = total_memory - allocated_memory
|
||
|
||
# Si menos de 1.5GB libre, forzar CPU
|
||
if free_memory < 1.5 * 1024**3:
|
||
logging.warning(f"⚠️ Memoria GPU baja: {free_memory / 1024**3:.2f}GB libre, usando CPU")
|
||
use_gpu = False
|
||
send_telegram_message("🔄 Memoria GPU insuficiente, usando CPU para procesamiento PDF")
|
||
except:
|
||
use_gpu = False
|
||
|
||
for attempt in range(max_retries):
|
||
try:
|
||
if use_gpu:
|
||
logging.info(f"🚀 Loading OCR models on GPU (attempt {attempt + 1}/{max_retries})...")
|
||
else:
|
||
logging.info(f"💻 Loading OCR models on CPU (attempt {attempt + 1}/{max_retries})...")
|
||
|
||
# Limpiar VRAM antes de cargar si usamos GPU
|
||
if use_gpu:
|
||
torch.cuda.empty_cache()
|
||
import gc
|
||
gc.collect()
|
||
|
||
if attempt > 0:
|
||
force_free_vram()
|
||
time.sleep(2)
|
||
|
||
# Cargar EasyOCR con GPU/CPU adaptativo
|
||
if _ocr_models is None:
|
||
_ocr_models = easyocr.Reader(['es'], gpu=use_gpu, verbose=False)
|
||
logging.info(f"✅ EasyOCR loaded on {'GPU' if use_gpu else 'CPU'}")
|
||
|
||
# Cargar TrOCR con manejo de memoria mejorado
|
||
if _trocr_models is None:
|
||
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
|
||
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
|
||
|
||
if use_gpu:
|
||
try:
|
||
device = "cuda"
|
||
model = model.to(device)
|
||
model.eval()
|
||
|
||
# Activar FP16 para reducir uso de memoria
|
||
try:
|
||
major, _ = torch.cuda.get_device_capability(0)
|
||
if major >= 7:
|
||
model = model.half()
|
||
logging.info("⚡ TrOCR en FP16 habilitado")
|
||
except Exception as capability_error:
|
||
logging.warning(f"No se pudo habilitar FP16 en TrOCR: {capability_error}")
|
||
|
||
logging.info("✅ TrOCR model loaded on GPU")
|
||
except RuntimeError as e:
|
||
if "out of memory" in str(e).lower() and attempt < max_retries - 1:
|
||
logging.warning(f"⚠️ TrOCR OOM en GPU, reintentando con CPU...")
|
||
use_gpu = False
|
||
continue
|
||
else:
|
||
raise
|
||
else:
|
||
# Usar CPU directamente
|
||
device = "cpu"
|
||
model = model.to(device)
|
||
model.eval()
|
||
logging.info("✅ TrOCR model loaded on CPU")
|
||
|
||
_trocr_models = {
|
||
'processor': processor,
|
||
'model': model
|
||
}
|
||
|
||
# Update usage timestamp after loading models
|
||
_update_models_usage()
|
||
return _ocr_models, _trocr_models
|
||
|
||
except RuntimeError as e:
|
||
if "CUDA-capable device" in str(e) or "out of memory" in str(e).lower():
|
||
if use_gpu and attempt < max_retries - 1:
|
||
logging.error(f"❌ CUDA error en intento {attempt + 1}: {e}")
|
||
logging.info(f"🔄 Reintentando con CPU...")
|
||
use_gpu = False # Forzar CPU en siguiente intento
|
||
continue
|
||
else:
|
||
error_msg = f"❌ ERROR después de {max_retries} intentos: {e}"
|
||
logging.error(error_msg)
|
||
if attempt == max_retries - 1:
|
||
send_telegram_message(error_msg)
|
||
raise RuntimeError(error_msg)
|
||
else:
|
||
logging.error(f"❌ Error inesperado: {e}")
|
||
raise
|
||
|
||
# Si llegamos aquí, todos los intentos fallaron
|
||
error_msg = "❌ ERROR: No se pudieron cargar los modelos OCR"
|
||
logging.error(error_msg)
|
||
raise RuntimeError(error_msg)
|
||
|
||
# --- DOCUMENT CONVERSION FUNCTIONS ---
|
||
def docx_to_text(docx_path):
|
||
"""Convert DOCX to plain text"""
|
||
doc = Document(docx_path)
|
||
return '\n'.join([para.text for para in doc.paragraphs if para.text.strip()])
|
||
|
||
def docx_to_markdown(docx_path):
|
||
"""Convert DOCX to Markdown format"""
|
||
doc = Document(docx_path)
|
||
md_lines = []
|
||
for para in doc.paragraphs:
|
||
text = para.text.strip()
|
||
if para.style.name.startswith('Heading'):
|
||
level = int(para.style.name.replace('Heading ', ''))
|
||
md_lines.append('#' * level + ' ' + text)
|
||
elif para.style.name == 'List Bullet':
|
||
md_lines.append(f"- {text}")
|
||
else:
|
||
md_lines.append(text)
|
||
return '\n'.join(md_lines)
|
||
|
||
def summarize_text_with_gemini(text):
|
||
"""Summarize text using the GLM-4.6 pipeline (compatibilidad)."""
|
||
success, _, _, formatted_summary = run_gemini_summary_pipeline(text)
|
||
if not success or not formatted_summary:
|
||
raise RuntimeError("GLM-4.6 no pudo generar el resumen solicitado")
|
||
return formatted_summary
|
||
|
||
|
||
# --- MAIN PROCESSING FUNCTIONS ---
|
||
def process_audio_file(file_path):
|
||
"""Process a single audio file"""
|
||
filename = os.path.basename(file_path)
|
||
send_telegram_message(
|
||
f"🎵 Nuevo audio detectado: {filename}\n"
|
||
f"🤖 Flujo activado:\n"
|
||
f"• GLM-4.6: puntos clave + resumen integrado\n"
|
||
f"• GLM-4.6: formato final"
|
||
)
|
||
|
||
base_name = os.path.splitext(filename)[0]
|
||
local_audio_path = os.path.join(LOCAL_DOWNLOADS_PATH, filename)
|
||
local_txt_path = os.path.join(LOCAL_DOWNLOADS_PATH, f"{base_name}.txt")
|
||
|
||
try:
|
||
send_telegram_message(f"⬇️ Descargando audio: {filename}")
|
||
webdav_download(file_path, local_audio_path)
|
||
|
||
send_telegram_message(f"📝 Iniciando transcripción de audio: {filename}")
|
||
transcribe_audio(local_audio_path, local_txt_path)
|
||
|
||
# Generate unified summary
|
||
result = generate_unified_summary(local_txt_path, base_name)
|
||
if result and result[0]:
|
||
success, summary_content, output_files = result
|
||
|
||
docx_path = Path(output_files.get('docx_path', '')) if output_files else None
|
||
markdown_path = Path(output_files.get('markdown_path', '')) if output_files else None
|
||
pdf_path_str = output_files.get('pdf_path') if output_files else None
|
||
pdf_path = Path(pdf_path_str) if pdf_path_str else None
|
||
docx_filename = output_files.get('docx_name') if output_files else None
|
||
|
||
if docx_path and docx_path.exists() and docx_filename:
|
||
ensure_thematic_folders_exist()
|
||
|
||
logging.info("🧠 Clasificando contenido inteligentemente...")
|
||
category = classify_content_intelligent(summary_content)
|
||
category_name = TEMATIC_FOLDERS.get(category, TEMATIC_FOLDERS["otras_clases"])
|
||
|
||
remote_docx_path = get_upload_path_for_category(category, docx_filename)
|
||
webdav_upload(str(docx_path), remote_docx_path)
|
||
logging.info(f"☁️ DOCX subido a {category_name}: {docx_filename}")
|
||
|
||
if pdf_path and pdf_path.exists():
|
||
pdf_name = pdf_path.name
|
||
remote_pdf_path = get_upload_path_for_category(category, pdf_name)
|
||
webdav_upload(str(pdf_path), remote_pdf_path)
|
||
logging.info(f"☁️ PDF subido a {category_name}: {pdf_name}")
|
||
|
||
try:
|
||
if markdown_path and markdown_path.exists():
|
||
remote_md_path = os.path.join('Notes', markdown_path.name)
|
||
webdav_upload(str(markdown_path), remote_md_path)
|
||
logging.info(f"Markdown subido a Notes: {markdown_path.name}")
|
||
except Exception as e:
|
||
logging.error(f"Error subiendo Markdown para {docx_filename}: {e}")
|
||
|
||
topics = extract_key_topics_from_text(summary_content)
|
||
topics_str = ' - '.join(topics[:2])
|
||
|
||
send_telegram_message(
|
||
f"☁️ ✅ Resumen GLM-4.6 clasificado y subido a '{category_name}'\n"
|
||
f"📄 {docx_filename}\n"
|
||
f"🧾 Recursos generados: DOCX, PDF y Markdown\n"
|
||
f"🧠 Temas: {topics_str}"
|
||
)
|
||
save_processed_file(file_path)
|
||
else:
|
||
raise Exception("Failed to generate summaries")
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error processing audio {filename}: {e}")
|
||
send_telegram_message(f"❌ Error processing audio {filename}: {e}")
|
||
|
||
def process_txt_file(file_path):
|
||
"""Process a single text file"""
|
||
filename = os.path.basename(file_path)
|
||
send_telegram_message(
|
||
f"📄 Nuevo texto detectado: {filename}\n"
|
||
f"🤖 Flujo activado:\n"
|
||
f"• GLM-4.6: puntos clave + resumen integrado\n"
|
||
f"• GLM-4.6: formato final"
|
||
)
|
||
|
||
base_name = os.path.splitext(filename)[0]
|
||
local_txt_path = os.path.join(LOCAL_DOWNLOADS_PATH, filename)
|
||
|
||
try:
|
||
send_telegram_message(f"⬇️ Descargando texto: {filename}")
|
||
webdav_download(file_path, local_txt_path)
|
||
|
||
# Generate unified summary
|
||
result = generate_unified_summary(local_txt_path, base_name)
|
||
if result and result[0]:
|
||
success, summary_content, output_files = result
|
||
|
||
docx_path = Path(output_files.get('docx_path', '')) if output_files else None
|
||
markdown_path = Path(output_files.get('markdown_path', '')) if output_files else None
|
||
pdf_path_str = output_files.get('pdf_path') if output_files else None
|
||
pdf_path = Path(pdf_path_str) if pdf_path_str else None
|
||
docx_filename = output_files.get('docx_name') if output_files else None
|
||
|
||
# Upload to Nextcloud with intelligent name
|
||
if docx_path and docx_path.exists():
|
||
remote_docx_path = f"{REMOTE_DOCX_AUDIO_FOLDER}/{docx_filename}"
|
||
webdav_upload(str(docx_path), remote_docx_path)
|
||
send_telegram_message(f"✅ Resumen DOCX subido: {docx_filename}")
|
||
|
||
if pdf_path and pdf_path.exists():
|
||
remote_pdf_filename = docx_filename.replace('.docx', '.pdf') if docx_filename else f"{base_name}.pdf"
|
||
remote_pdf_path = f"{RESUMENES_FOLDER}/{remote_pdf_filename}"
|
||
webdav_upload(str(pdf_path), remote_pdf_path)
|
||
|
||
if markdown_path and markdown_path.exists():
|
||
remote_md_filename = docx_filename.replace('.docx', '.md') if docx_filename else f"{base_name}.md"
|
||
remote_md_path = f"{RESUMENES_FOLDER}/{remote_md_filename}"
|
||
webdav_upload(str(markdown_path), remote_md_path)
|
||
|
||
send_telegram_message(
|
||
f"✅ Resumen completado: {filename}\n"
|
||
f"📄 DOCX: {REMOTE_DOCX_AUDIO_FOLDER}/{docx_filename if docx_filename else base_name}"
|
||
)
|
||
save_processed_file(file_path)
|
||
else:
|
||
raise Exception("Failed to generate summaries")
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error processing text {filename}: {e}")
|
||
send_telegram_message(f"❌ Error processing text {filename}: {e}")
|
||
|
||
def check_pdf_already_processed(file_path, filename, base_name):
|
||
"""Verificación inteligente para evitar reprocesamiento de PDFs"""
|
||
|
||
# 1. Verificar si el DOCX editable ya existe localmente
|
||
local_docx_path = os.path.join(LOCAL_DOWNLOADS_PATH, f"{base_name}_editable.docx")
|
||
if os.path.exists(local_docx_path):
|
||
logging.info(f"📋 DOCX editable ya existe localmente: {base_name}_editable.docx")
|
||
return True
|
||
|
||
# 2. Verificar si el DOCX editable ya existe en Nextcloud
|
||
try:
|
||
remote_docx_path = os.path.join(os.path.dirname(file_path), f"{base_name}_editable.docx")
|
||
response = requests.request(
|
||
"PROPFIND",
|
||
f"{WEBDAV_ENDPOINT}/{remote_docx_path}",
|
||
auth=HTTPBasicAuth(NEXTCLOUD_USER, NEXTCLOUD_PASS),
|
||
headers={"Depth": "0"},
|
||
timeout=5
|
||
)
|
||
if response.status_code == 207: # Multi-Status significa que existe
|
||
logging.info(f"☁️ DOCX editable ya existe en Nextcloud: {remote_docx_path}")
|
||
return True
|
||
except Exception as e:
|
||
logging.debug(f"No se pudo verificar existencia en Nextcloud: {e}")
|
||
|
||
# 3. Verificar si ya fue procesado (fallback)
|
||
processed_files = load_processed_files()
|
||
normalized_path = normalize_remote_path(file_path)
|
||
base_name_check = os.path.basename(normalized_path)
|
||
|
||
if (normalized_path in processed_files or
|
||
base_name_check in processed_files or
|
||
filename in processed_files):
|
||
logging.info(f"📋 PDF ya está en registro de procesados: {filename}")
|
||
return True
|
||
|
||
return False
|
||
|
||
def process_pdf_main(file_path):
|
||
"""Process a single PDF file - main handler"""
|
||
filename = os.path.basename(file_path)
|
||
|
||
base_name = os.path.splitext(filename)[0]
|
||
local_pdf_path = os.path.join(LOCAL_DOWNLOADS_PATH, filename)
|
||
local_docx_output_path = os.path.join(LOCAL_DOWNLOADS_PATH, f"{base_name}_editable.docx")
|
||
remote_docx_filename = f"{base_name}_editable.docx"
|
||
|
||
# VERIFICACIÓN INTELIGENTE ANTES DE PROCESAR
|
||
if check_pdf_already_processed(file_path, filename, base_name):
|
||
logging.info(f"⏭️ PDF ya procesado, omitiendo: {filename}")
|
||
return
|
||
|
||
send_telegram_message(f"📄 Nuevo PDF detectado para procesar: {filename}")
|
||
|
||
try:
|
||
logging.info(f"Downloading PDF: {filename}")
|
||
webdav_download(file_path, local_pdf_path)
|
||
|
||
logging.info(f"Starting OCR and correction processing for: {filename}")
|
||
process_pdf_file(local_pdf_path, local_docx_output_path)
|
||
|
||
# Upload the generated editable DOCX file
|
||
if os.path.exists(local_docx_output_path):
|
||
remote_docx_path = os.path.join(os.path.dirname(file_path), remote_docx_filename)
|
||
logging.info(f"Uploading editable document to Nextcloud: {remote_docx_filename}")
|
||
webdav_upload(local_docx_output_path, remote_docx_path)
|
||
send_telegram_message(f"📄☁️ Documento editable subido a Nextcloud para: {filename}")
|
||
|
||
# Marcar como procesado inmediatamente después de subir el DOCX editable
|
||
save_processed_file(file_path)
|
||
logging.info(f"✅ Archivo PDF marcado como procesado: {filename}")
|
||
|
||
# Generar resumen completo con GLM-4.6 para todos los PDFs (no bloquea el procesamiento)
|
||
try:
|
||
send_telegram_message(f"🤖 Generando resumen completo con GLM-4.6 para: {filename}")
|
||
|
||
docx_text = docx_to_text(local_docx_output_path)
|
||
|
||
# Usar el sistema de resumen unificado con GLM-4.6
|
||
success, bullet_points, raw_summary, formatted_summary = run_gemini_summary_pipeline(docx_text)
|
||
|
||
if success and formatted_summary:
|
||
# Crear documento DOCX con el resumen
|
||
summary_docx_path = os.path.join(LOCAL_DOWNLOADS_PATH, f"{base_name}_resumen_completo.docx")
|
||
|
||
doc = Document()
|
||
doc.add_heading('Resumen Completo Generado con GLM-4.6', level=1)
|
||
doc.add_paragraph(f'Documento original: {filename}')
|
||
doc.add_paragraph('')
|
||
|
||
# Añadir contenido formateado
|
||
lines = formatted_summary.split('\n')
|
||
current_paragraph = []
|
||
|
||
for line in lines:
|
||
line = line.strip()
|
||
if not line:
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
current_paragraph = []
|
||
continue
|
||
|
||
if line.startswith('#'):
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
current_paragraph = []
|
||
# Procesar encabezado
|
||
level = len(line) - len(line.lstrip('#'))
|
||
if level <= 6:
|
||
doc.add_heading(line.lstrip('#').strip(), level=level)
|
||
else:
|
||
current_paragraph.append(line)
|
||
elif line.startswith('-') or line.startswith('•'):
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
current_paragraph = []
|
||
doc.add_paragraph(line.lstrip('-•').strip(), style='List Bullet')
|
||
else:
|
||
current_paragraph.append(line)
|
||
|
||
if current_paragraph:
|
||
doc.add_paragraph(' '.join(current_paragraph))
|
||
|
||
# Generar quiz si hay contenido suficiente
|
||
try:
|
||
quiz_text = (bullet_points or "") + "\n\n" + (raw_summary or "")
|
||
if len(quiz_text.strip()) > 100:
|
||
questions, answers = generate_quiz(quiz_text)
|
||
if questions and answers:
|
||
add_quiz_to_docx(doc, questions, answers)
|
||
logging.info("✅ Quiz agregado al resumen del PDF")
|
||
except Exception as quiz_error:
|
||
logging.warning(f"No se pudo generar quiz para el PDF: {quiz_error}")
|
||
|
||
doc.save(summary_docx_path)
|
||
|
||
# Subir resumen DOCX a Nextcloud
|
||
remote_summary_path = os.path.join('Resumenes', f"{base_name}_resumen_completo.docx")
|
||
webdav_mkdir('Resumenes')
|
||
webdav_upload(summary_docx_path, remote_summary_path)
|
||
|
||
# También crear y subir versión Markdown
|
||
md_content = f"# Resumen: {filename}\n\n{formatted_summary}"
|
||
md_filename = f"{base_name}_resumen_completo.md"
|
||
local_md_path = os.path.join(LOCAL_DOWNLOADS_PATH, md_filename)
|
||
with open(local_md_path, 'w', encoding='utf-8') as f:
|
||
f.write(md_content)
|
||
remote_md_path = os.path.join('Notes', md_filename)
|
||
webdav_upload(local_md_path, remote_md_path)
|
||
|
||
send_telegram_message(f"✅ Resumen completo generado y subido para: {filename}\n📄 DOCX en Resumenes/\n📝 Markdown en Notes/")
|
||
logging.info(f"✅ Resumen completo generado y subido para {filename}")
|
||
|
||
else:
|
||
# Fallback: resumen simple si falla GLM-4.6
|
||
simple_summary = f"# Resumen de {filename}\n\nTexto procesado exitosamente. No se pudo generar resumen detallado."
|
||
md_filename = f"{base_name}_resumen_simple.md"
|
||
local_md_path = os.path.join(LOCAL_DOWNLOADS_PATH, md_filename)
|
||
with open(local_md_path, 'w', encoding='utf-8') as f:
|
||
f.write(simple_summary)
|
||
remote_md_path = os.path.join('Notes', md_filename)
|
||
webdav_upload(local_md_path, remote_md_path)
|
||
logging.warning(f"⚠️ Resumen simple generado para {filename}")
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error generando resumen para {filename}: {e}")
|
||
# No notificar error por Telegram para evitar spam
|
||
else:
|
||
logging.warning(f"Expected output file not found: {local_docx_output_path}")
|
||
# Si no se encontró el archivo, igual marcar como procesado para evitar bucles
|
||
save_processed_file(file_path)
|
||
logging.warning(f"⚠️ Archivo marcado como procesado sin DOCX: {filename}")
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error in conversion process for PDF {filename}: {e}")
|
||
key = f"pdf_process::{file_path}"
|
||
msg = f"❌ Error processing PDF {filename}: {e}"
|
||
if should_send_error(key, str(e)):
|
||
send_telegram_message(msg)
|
||
|
||
def acquire_lock():
|
||
"""Adquiere un bloqueo para evitar múltiples instancias"""
|
||
try:
|
||
lock_file = os.path.join(LOCAL_STATE_DIR, ".main_service.lock")
|
||
os.makedirs(os.path.dirname(lock_file), exist_ok=True)
|
||
|
||
lock_fd = open(lock_file, 'w')
|
||
fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||
|
||
# Escribir PID en el archivo de lock
|
||
lock_fd.write(str(os.getpid()))
|
||
lock_fd.flush()
|
||
|
||
logging.info(f"🔒 Bloqueo adquirido. PID: {os.getpid()}")
|
||
return lock_fd
|
||
|
||
except (IOError, OSError) as e:
|
||
if e.errno == 11: # EAGAIN - Resource temporarily unavailable
|
||
logging.error("❌ Ya hay otra instancia del servicio corriendo")
|
||
sys.exit(1)
|
||
else:
|
||
logging.error(f"❌ Error adquiriendo bloqueo: {e}")
|
||
sys.exit(1)
|
||
|
||
|
||
def release_lock(lock_fd) -> None:
|
||
"""Libera el bloqueo de ejecución si está activo."""
|
||
if not lock_fd:
|
||
return
|
||
try:
|
||
fcntl.flock(lock_fd.fileno(), fcntl.LOCK_UN)
|
||
except Exception as exc:
|
||
logging.warning(f"No se pudo liberar el bloqueo limpiamente: {exc}")
|
||
finally:
|
||
try:
|
||
lock_fd.close()
|
||
except Exception:
|
||
pass
|
||
|
||
# --- MAIN LOOP ---
|
||
def main():
|
||
"""Main application loop"""
|
||
# Adquirir bloqueo para evitar múltiples instancias
|
||
lock_fd = acquire_lock()
|
||
try:
|
||
logging.info("=== INICIO Nextcloud AI Service - Flujo GLM-4.6 (Claude CLI) ===")
|
||
logging.info("🤖 Configuración: GLM-4.6 (Claude CLI via z.ai) para puntos clave y resumen integral")
|
||
logging.info("📝 Modo de operación: Resúmenes colaborativos unificados")
|
||
logging.info("🔒 Servicio protegido contra múltiples instancias")
|
||
|
||
# Enviar mensaje de Telegram (no bloquear si falla)
|
||
try:
|
||
send_telegram_message(
|
||
"✨ Nextcloud AI Service Started ✨\n"
|
||
"🚀 Flujo GLM-4.6 activado:\n"
|
||
"• GLM-4.6: Puntos clave y resumen integrado\n"
|
||
"• GLM-4.6: Formato y entrega final"
|
||
)
|
||
except Exception as e:
|
||
logging.warning(f"No se pudo enviar mensaje de Telegram: {e}")
|
||
|
||
# Create necessary directories
|
||
ensure_local_directories()
|
||
|
||
# Inicializar timestamp y sistema de monitoreo de VRAM
|
||
logging.info("🚀 Iniciando sistema de monitoreo de VRAM...")
|
||
_update_models_usage() # Inicializar timestamp al inicio
|
||
_start_vram_cleanup_timer()
|
||
|
||
while True:
|
||
try:
|
||
logging.info("--- Polling for new files ---")
|
||
processed_files = load_processed_files()
|
||
|
||
# --- PROCESS PDFs FOR CONVERSION TO EDITABLE ---
|
||
try:
|
||
webdav_mkdir(REMOTE_PDF_FOLDER)
|
||
pdf_files = webdav_list(REMOTE_PDF_FOLDER)
|
||
for file_path in pdf_files:
|
||
normalized_path = normalize_remote_path(file_path)
|
||
base_name = os.path.basename(normalized_path)
|
||
filename = base_name
|
||
|
||
# Skip if not PDF or if it's already an editable PDF
|
||
if (
|
||
not normalized_path.lower().endswith('.pdf')
|
||
or '_editable.docx' in normalized_path.lower()
|
||
):
|
||
continue
|
||
|
||
# VERIFICACIÓN INTELIGENTE ANTES DE PROCESAR (doble seguridad)
|
||
base_name_no_ext = os.path.splitext(filename)[0]
|
||
if check_pdf_already_processed(normalized_path, filename, base_name_no_ext):
|
||
logging.info(f"⏭️ PDF ya verificado como procesado, omitiendo: {filename}")
|
||
continue
|
||
|
||
process_pdf_main(normalized_path)
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error processing PDF folder for conversion: {e}")
|
||
|
||
# --- PROCESS AUDIOS ---
|
||
try:
|
||
webdav_mkdir(REMOTE_DOCX_AUDIO_FOLDER)
|
||
audio_files = webdav_list(REMOTE_AUDIOS_FOLDER)
|
||
for file_path in audio_files:
|
||
normalized_path = normalize_remote_path(file_path)
|
||
base_name = os.path.basename(normalized_path)
|
||
|
||
if (
|
||
not any(normalized_path.lower().endswith(ext) for ext in AUDIO_EXTENSIONS)
|
||
or normalized_path in processed_files
|
||
or base_name in processed_files
|
||
):
|
||
continue
|
||
|
||
process_audio_file(normalized_path)
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error processing Audio folder: {e}")
|
||
|
||
# --- PROCESS TEXT FILES ---
|
||
try:
|
||
webdav_mkdir(REMOTE_TXT_FOLDER)
|
||
txt_files = webdav_list(REMOTE_TXT_FOLDER)
|
||
for file_path in txt_files:
|
||
normalized_path = normalize_remote_path(file_path)
|
||
base_name = os.path.basename(normalized_path)
|
||
|
||
if (
|
||
not any(normalized_path.lower().endswith(ext) for ext in TXT_EXTENSIONS)
|
||
or normalized_path in processed_files
|
||
or base_name in processed_files
|
||
):
|
||
continue
|
||
|
||
process_txt_file(normalized_path)
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error processing Text folder: {e}")
|
||
|
||
except Exception as cycle_error:
|
||
logging.exception(f"Error inesperado en el ciclo principal: {cycle_error}")
|
||
if should_send_error("main_loop", str(cycle_error)):
|
||
send_telegram_message(f"❌ Error en ciclo principal: {cycle_error}")
|
||
|
||
logging.info(f"--- Cycle completed. Waiting {POLL_INTERVAL} seconds... ---")
|
||
time.sleep(POLL_INTERVAL)
|
||
|
||
except KeyboardInterrupt:
|
||
logging.info("🛑 Interrupción recibida, cerrando servicio")
|
||
finally:
|
||
release_lock(lock_fd)
|
||
|
||
def start_dashboard():
|
||
"""Inicia el dashboard Flask en un hilo separado"""
|
||
try:
|
||
# Importar dashboard aquí para evitar importaciones circulares
|
||
import dashboard
|
||
import threading
|
||
|
||
def run_dashboard():
|
||
"""Función para ejecutar el dashboard en un hilo"""
|
||
logging.info("🚀 Iniciando Dashboard Flask en http://localhost:5000")
|
||
dashboard.app.run(
|
||
host='0.0.0.0',
|
||
port=5000,
|
||
debug=False,
|
||
threaded=True,
|
||
use_reloader=False # Importante: evitar reloading en producción
|
||
)
|
||
|
||
# Crear y iniciar hilo para el dashboard
|
||
dashboard_thread = threading.Thread(target=run_dashboard, daemon=True)
|
||
dashboard_thread.start()
|
||
|
||
logging.info("✅ Dashboard iniciado en hilo separado")
|
||
logging.info("🌐 Accede al dashboard en: http://localhost:5000")
|
||
|
||
return dashboard_thread
|
||
|
||
except Exception as e:
|
||
logging.error(f"❌ Error iniciando dashboard: {e}")
|
||
logging.warning("⚠️ El servicio principal continuará sin dashboard")
|
||
return None
|
||
|
||
|
||
if __name__ == "__main__":
|
||
# Handle command line arguments for specific operations
|
||
if len(sys.argv) > 1:
|
||
command = sys.argv[1]
|
||
|
||
if command == "whisper" and len(sys.argv) == 4:
|
||
# Whisper transcription mode
|
||
transcribe_audio(sys.argv[2], sys.argv[3])
|
||
elif command == "pdf" and len(sys.argv) == 4:
|
||
# PDF processing mode
|
||
process_pdf_file(sys.argv[2], sys.argv[3])
|
||
elif command == "seed-processed":
|
||
snapshot = _snapshot_existing_remote_files()
|
||
current = load_processed_files()
|
||
|
||
entries_to_add = []
|
||
for entry in snapshot:
|
||
normalized = normalize_remote_path(entry)
|
||
base_name = os.path.basename(normalized)
|
||
if normalized in current or base_name in current:
|
||
continue
|
||
entries_to_add.append(normalized)
|
||
|
||
if entries_to_add:
|
||
with open(PROCESSED_FILES_PATH, "a", encoding="utf-8") as f:
|
||
for normalized in sorted(entries_to_add):
|
||
f.write(normalized + "\n")
|
||
print(f"✅ {len(entries_to_add)} entradas añadidas al registro de procesados")
|
||
logging.info(f"Registro de procesados actualizado con {len(entries_to_add)} entradas nuevas")
|
||
else:
|
||
print("ℹ️ No se encontraron archivos adicionales para marcar como procesados")
|
||
sys.exit(0)
|
||
elif command == "txt2docx" and len(sys.argv) == 4:
|
||
# Text to unified DOCX conversion mode
|
||
txt_file = sys.argv[2]
|
||
output_docx = sys.argv[3]
|
||
|
||
if not os.path.exists(txt_file):
|
||
print(f"❌ Text file not found: {txt_file}")
|
||
sys.exit(1)
|
||
|
||
# Extract base name for file generation
|
||
base_name = os.path.splitext(os.path.basename(txt_file))[0]
|
||
|
||
print(f"🤖 Iniciando resumen colaborativo para: {txt_file}")
|
||
|
||
# Generate unified summary
|
||
result = generate_unified_summary(txt_file, base_name)
|
||
|
||
if result and result[0]:
|
||
success, summary_content, output_files = result
|
||
|
||
docx_path = Path(output_files.get('docx_path', ''))
|
||
markdown_path = Path(output_files.get('markdown_path', ''))
|
||
pdf_path_str = output_files.get('pdf_path')
|
||
pdf_path = Path(pdf_path_str) if pdf_path_str else None
|
||
|
||
if not docx_path.exists():
|
||
print("❌ No se generó el DOCX de salida")
|
||
sys.exit(1)
|
||
|
||
if str(docx_path) != output_docx:
|
||
shutil.copy2(docx_path, output_docx)
|
||
|
||
category = classify_content_intelligent(summary_content)
|
||
category_name = TEMATIC_FOLDERS.get(category, TEMATIC_FOLDERS["otras_clases"])
|
||
topics = extract_key_topics_from_text(summary_content)
|
||
topics_str = ' - '.join(topics[:2])
|
||
|
||
print(f"✅ Resumen unificado generado: {output_docx}")
|
||
print(f"🧠 Clasificación automática: {category_name}")
|
||
print(f"🎯 Temas identificados: {topics_str}")
|
||
print(f"📝 Nombre inteligente: {output_files.get('docx_name')}")
|
||
if markdown_path and markdown_path.exists():
|
||
print(f"📄 Markdown: {markdown_path}")
|
||
if pdf_path and pdf_path.exists():
|
||
print(f"📄 PDF: {pdf_path}")
|
||
else:
|
||
print("❌ Failed to generate unified summary")
|
||
sys.exit(1)
|
||
elif command == "quiz" and len(sys.argv) == 4:
|
||
# Quiz generation mode
|
||
input_text = sys.argv[2]
|
||
output_file = sys.argv[3]
|
||
|
||
# If the first argument is a file, read it
|
||
if os.path.isfile(input_text):
|
||
with open(input_text, 'r', encoding='utf-8') as f:
|
||
summary_text = f.read()
|
||
else:
|
||
summary_text = input_text
|
||
|
||
# Generate quiz
|
||
questions, answers = generate_quiz(summary_text)
|
||
|
||
if not questions or not answers:
|
||
print("❌ Could not generate quiz")
|
||
sys.exit(1)
|
||
|
||
# Create document
|
||
doc = Document()
|
||
doc.add_heading('Quiz Generado', level=1)
|
||
|
||
# Add quiz to document
|
||
add_quiz_to_docx(doc, questions, answers)
|
||
|
||
# Save file
|
||
doc.save(output_file)
|
||
print(f"✅ Quiz generated: {output_file}")
|
||
elif command == "dashboard-only":
|
||
# Solo ejecutar el dashboard
|
||
import dashboard
|
||
logging.info("🚀 Iniciando Dashboard Flask únicamente")
|
||
dashboard.app.run(host='0.0.0.0', port=5000, debug=False, threaded=True)
|
||
else:
|
||
print("Usage:")
|
||
print(" python main.py # Run main polling service + dashboard")
|
||
print(" python main.py whisper <audio> <txt> # Transcribe audio")
|
||
print(" python main.py pdf <pdf> <docx> # Process PDF to editable DOCX")
|
||
print(" python main.py seed-processed # Marca archivos actuales como procesados")
|
||
print(" python main.py txt2docx <txt> <docx> # Convert text to summary DOCX")
|
||
print(" python main.py quiz <text> <docx> # Generate quiz from text")
|
||
print(" python main.py dashboard-only # Solo ejecutar dashboard")
|
||
sys.exit(1)
|
||
else:
|
||
# Run main polling service with integrated dashboard
|
||
logging.info("=" * 60)
|
||
logging.info("🚀 INICIANDO SERVICIO COMPLETO")
|
||
logging.info("=" * 60)
|
||
|
||
# Iniciar dashboard en hilo separado
|
||
dashboard_thread = start_dashboard()
|
||
|
||
# Pequeña pausa para que el dashboard se inicie
|
||
time.sleep(2)
|
||
|
||
# Ejecutar servicio principal
|
||
main()
|