- Corrige PDFGenerator para pasar contenido (no ruta) - Agrega prompt siguiendo código.md (español, estructura académica) - Limpia thinking tokens de respuesta AI - Agrega skip de archivos ya procesados en watcher - Implementa tablas LaTeX en PDFs (reportlab Table) - Agrega load_dotenv() en main.py - Actualiza .env con MiniMax config - Agrega transcriptions/ a .gitignore Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
239 lines
7.6 KiB
Python
239 lines
7.6 KiB
Python
"""
|
|
Watcher de carpeta local.
|
|
Monitorea una carpeta por archivos nuevos y los procesa.
|
|
"""
|
|
import logging
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Callable, Optional
|
|
from watchdog.observers import Observer
|
|
from watchdog.events import FileSystemEventHandler, FileCreatedEvent
|
|
|
|
from config import settings
|
|
from services import WebDAVService
|
|
|
|
|
|
class FileHandler(FileSystemEventHandler):
    """Filesystem event handler that forwards newly created files to a callback."""

    def __init__(
        self,
        on_new_file: Callable[[Path], None],
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """Store the callback and the logger.

        Args:
            on_new_file: Callable invoked with the path of each accepted file.
            logger: Logger to use; when omitted, this module's logger is used.
        """
        super().__init__()
        self.on_new_file = on_new_file
        self.logger = logger if logger else logging.getLogger(__name__)

    def on_created(self, event: FileCreatedEvent) -> None:
        """React to a creation event reported by the observer.

        Directory events are ignored. Every file event is logged; temporary
        download files and hidden files are then dropped. For the remaining
        files the handler waits one second and calls ``on_new_file``. Any
        exception raised by the callback is logged and not re-raised.
        """
        if event.is_directory:
            return

        created = Path(event.src_path)
        self.logger.info(f"New file detected: {created}")

        # Temporary/partial download artifacts and dot-files are not processed.
        is_temporary = created.suffix in (".tmp", ".part", ".crdownload")
        is_hidden = created.name.startswith(".")
        if is_temporary or is_hidden:
            return

        # Fixed pause intended to let the writer finish before the file is used.
        time.sleep(1)

        try:
            self.on_new_file(created)
        except Exception as exc:
            self.logger.error(f"Error processing file {created}: {exc}")
|
|
|
|
|
|
class FolderWatcher:
    """Watch a local folder and hand each newly created file to a callback.

    A watchdog ``Observer`` is scheduled (non-recursively) on ``watch_path``
    with a ``FileHandler``; accepted files reach ``_handle_new_file``, which
    de-duplicates by path string before invoking the registered callback.
    """

    def __init__(
        self,
        watch_path: Optional[Path] = None,
        on_new_file: Optional[Callable[[Path], None]] = None,
    ) -> None:
        """Configure the watcher and create the watched folder if missing.

        Args:
            watch_path: Folder to watch; falls back to
                ``settings.DOWNLOADS_DIR`` when not given.
            on_new_file: Optional callback invoked with each new file path.
        """
        self.logger = logging.getLogger(__name__)
        self.watch_path = watch_path or settings.DOWNLOADS_DIR
        self.on_new_file_callback = on_new_file
        self._observer: Optional[Observer] = None
        self._running = False
        # Path strings of files that were already handed to the callback.
        self._processed_files: set[str] = set()

        # Make sure the watched folder exists.
        self.watch_path.mkdir(parents=True, exist_ok=True)

    def set_callback(self, callback: Callable[[Path], None]) -> None:
        """Set the callback invoked for new files."""
        self.on_new_file_callback = callback

    def start(self) -> None:
        """Start the watcher; a second call while running only logs a warning."""
        if self._running:
            self.logger.warning("Watcher already running")
            return

        self.logger.info(f"Starting folder watcher on: {self.watch_path}")

        event_handler = FileHandler(
            on_new_file=self._handle_new_file,
            logger=self.logger,
        )

        # Publish the observer on self only once it is actually running, so a
        # failure in schedule()/start() does not leave a dead observer behind.
        observer = Observer()
        observer.schedule(event_handler, str(self.watch_path), recursive=False)
        observer.start()
        self._observer = observer
        self._running = True

        self.logger.info("Folder watcher started")

    def stop(self) -> None:
        """Stop the watcher and wait for the observer thread to finish."""
        if not self._running:
            return

        self.logger.info("Stopping folder watcher")
        if self._observer:
            self._observer.stop()
            self._observer.join()
            self._observer = None

        self._running = False
        self.logger.info("Folder watcher stopped")

    def _handle_new_file(self, file_path: Path) -> None:
        """Dispatch a newly detected file to the callback at most once.

        A path is remembered as processed only when a callback actually
        handled it: with no callback registered nothing is recorded, and when
        the callback raises the path is forgotten again so a later event for
        the same path can be retried.

        Args:
            file_path: Path of the detected file.

        Raises:
            Exception: Whatever the callback raises is propagated unchanged.
        """
        file_key = str(file_path)

        # Avoid processing the same file twice.
        if file_key in self._processed_files:
            return

        callback = self.on_new_file_callback
        if not callback:
            self.logger.warning(
                f"No callback registered; skipping file: {file_path}"
            )
            return

        # Record before dispatching so a re-entrant event for the same path
        # is ignored while the callback is still running.
        self._processed_files.add(file_key)
        self.logger.info(f"Processing new file: {file_path}")

        try:
            callback(file_path)
        except Exception:
            self._processed_files.discard(file_key)
            raise

    def get_status(self) -> dict:
        """Return the running flag, watched path and processed-file count."""
        return {
            "running": self._running,
            "watch_path": str(self.watch_path),
            "processed_files_count": len(self._processed_files),
        }
|
|
|
|
|
|
class RemoteFolderWatcher:
    """Download files that appear in a remote WebDAV folder (Nextcloud).

    Each scan lists the remote folder, downloads the names not seen in the
    previous scan into ``local_path`` and passes each downloaded path to the
    registered callback. No background loop is started by this class: a scan
    runs once from ``start()`` and afterwards only when ``check_now()`` is
    called.
    """

    def __init__(
        self,
        webdav_service: "WebDAVService",
        local_path: Optional[Path] = None,
        remote_path: Optional[str] = None,
    ) -> None:
        """Configure the watcher and create the local folder if missing.

        Args:
            webdav_service: Service used for ``list_files`` and
                ``download_file``.
            local_path: Download destination; falls back to
                ``settings.DOWNLOADS_DIR`` when not given.
            remote_path: Remote folder to scan; falls back to
                ``settings.WATCHED_REMOTE_PATH`` when not given.
        """
        self.logger = logging.getLogger(__name__)
        self.webdav = webdav_service
        self.local_path = local_path or settings.DOWNLOADS_DIR
        self.remote_path = remote_path or settings.WATCHED_REMOTE_PATH
        self._running = False
        # Remote names from the last scan that were handled successfully.
        self._last_checked_files: set[str] = set()
        self._on_download: Optional[Callable[[Path], None]] = None

        # Make sure the local folder exists.
        self.local_path.mkdir(parents=True, exist_ok=True)

    def set_callback(self, callback: Callable[[Path], None]) -> None:
        """Set the callback invoked with the local path of each download."""
        self._on_download = callback

    def start(self) -> None:
        """Mark the watcher as running and perform the first scan."""
        if self._running:
            self.logger.warning("Remote watcher already running")
            return

        self._running = True
        self.logger.info(f"Starting remote folder watcher: {self.remote_path} -> {self.local_path}")

        # Initial scan.
        self._check_for_new_files()

    def stop(self) -> None:
        """Mark the watcher as stopped; later scans become no-ops."""
        self._running = False
        self.logger.info("Remote folder watcher stopped")

    def check_now(self) -> None:
        """Force an immediate scan (does nothing unless the watcher is running)."""
        self._check_for_new_files()

    def _check_for_new_files(self) -> None:
        """Scan the remote folder and download names not seen before.

        Only names that were handled successfully are remembered, so a file
        whose download or callback failed is attempted again on the next
        scan. A failure on one file is logged and does not stop the others.
        """
        if not self._running:
            return

        try:
            files = self.webdav.list_files(self.remote_path)
        except Exception as e:
            self.logger.error(f"Error checking remote files: {e}")
            return

        listed = set(files)
        new_files = listed - self._last_checked_files

        if new_files:
            self.logger.info(f"Found {len(new_files)} new files")

        failed: set[str] = set()
        # Sorted only to make the processing order deterministic.
        for filename in sorted(new_files):
            if not filename.strip():  # Ignore empty names.
                continue
            try:
                succeeded = self._download_file(filename)
            except Exception as e:
                self.logger.error(f"Error processing remote file {filename}: {e}")
                succeeded = False
            if not succeeded:
                failed.add(filename)

        # Forget the failures so the next scan sees them as new again.
        self._last_checked_files = listed - failed

    def _is_already_processed(self, local_file: Path) -> bool:
        """Tell whether a ``.txt`` transcription matching *local_file* exists.

        Looks in the ``transcriptions`` folder next to ``local_path``.
        """
        transcriptions_dir = self.local_path.parent / "transcriptions"
        if not transcriptions_dir.exists():
            return False

        stem = local_file.stem
        # NOTE(review): this is a substring match, so a short stem can match
        # an unrelated transcription name; confirm the naming scheme before
        # tightening it.
        return any(
            entry.suffix == ".txt" and stem in entry.stem
            for entry in transcriptions_dir.iterdir()
        )

    def _download_file(self, filename: str) -> bool:
        """Download one remote file and notify the callback.

        A file that already exists locally and has a transcription is
        skipped; one that exists without a transcription is downloaded again.

        Args:
            filename: Name of the file inside the remote folder.

        Returns:
            True when the file was skipped as already processed or was
            downloaded; False when the download reported failure.

        Raises:
            Exception: Whatever the download callback raises is propagated.
        """
        remote_path = f"{self.remote_path}/{filename}"
        local_path = self.local_path / filename

        if local_path.exists():
            if self._is_already_processed(local_path):
                self.logger.info(f"Skipping already processed file: {filename}")
                return True
            # Present locally but never transcribed: fetch it again.
            self.logger.info(f"Re-downloading incomplete file: {filename}")

        self.logger.info(f"Downloading: {remote_path}")

        if not self.webdav.download_file(remote_path, local_path):
            self.logger.error(f"Failed to download: {filename}")
            return False

        if self._on_download:
            self._on_download(local_path)
        return True

    def get_status(self) -> dict:
        """Return the running flag, both paths and the remembered-name count."""
        return {
            "running": self._running,
            "remote_path": self.remote_path,
            "local_path": str(self.local_path),
            "last_checked_files": len(self._last_checked_files),
        }
|