feat: Sistema CBCFacil completo con cola secuencial
- Implementa ProcessingMonitor singleton para procesamiento secuencial de archivos - Agrega AI summary service con soporte para MiniMax API - Agrega PDF generator para resúmenes - Agrega watchers para monitoreo de carpeta remota - Mejora sistema de notificaciones Telegram - Implementa gestión de VRAM para GPU - Configuración mediante variables de entorno (sin hardcoded secrets) - .env y transcriptions/ agregados a .gitignore Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
"""Procesadores de documentos y medios."""
# NOTE(review): the diff residue contained two conflicting module bodies
# (the old four-processor package and the new audio-only one). The new
# version is kept, per the final `__all__` in the diff.

from processors.audio_processor import AudioProcessor, AudioProcessingError

__all__ = ["AudioProcessor", "AudioProcessingError"]
||||
@@ -1,93 +1,467 @@
|
||||
"""
|
||||
Audio file processor using Whisper
|
||||
Procesador de audio para transcripción con Whisper.
|
||||
|
||||
OPTIMIZACIONES DE MEMORIA PARA GPUs DE 8GB:
|
||||
- Cache global singleton para evitar carga múltiple del modelo
|
||||
- Configuración PYTORCH_ALLOC_CONF para reducir fragmentación
|
||||
- Verificación de VRAM antes de cargar
|
||||
- Fallback automático a CPU si GPU OOM
|
||||
- Limpieza agresiva de cache CUDA
|
||||
"""
|
||||
import gc
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from core import FileProcessingError
|
||||
from typing import Dict, Literal, Optional, Tuple
|
||||
|
||||
import whisper
|
||||
|
||||
from config import settings
|
||||
from services import vram_manager
|
||||
from services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
try:
|
||||
import whisper
|
||||
import torch
|
||||
WHISPER_AVAILABLE = True
|
||||
except ImportError:
|
||||
WHISPER_AVAILABLE = False
|
||||
from services.vram_manager import vram_manager
|
||||
|
||||
|
||||
class AudioProcessor(FileProcessor):
|
||||
"""Processor for audio files using Whisper"""
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def __init__(self):
|
||||
super().__init__("AudioProcessor")
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._model = None
|
||||
self._model_name = "medium" # Optimized for Spanish
|
||||
|
||||
def can_process(self, file_path: str) -> bool:
|
||||
"""Check if file is an audio file"""
|
||||
ext = self.get_file_extension(file_path)
|
||||
return ext in settings.AUDIO_EXTENSIONS
|
||||
# ============ CONFIGURACIÓN DE OPTIMIZACIONES ============
|
||||
|
||||
def _load_model(self):
|
||||
"""Load Whisper model lazily"""
|
||||
if not WHISPER_AVAILABLE:
|
||||
raise FileProcessingError("Whisper not installed")
|
||||
# CRÍTICO: Permite segmentos expandibles para reducir fragmentación
|
||||
os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
|
||||
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
|
||||
|
||||
if self._model is None:
|
||||
device = gpu_detector.get_device()
|
||||
self.logger.info(f"Loading Whisper model: {self._model_name} on {device}")
|
||||
self._model = whisper.load_model(self._model_name, device=device)
|
||||
vram_manager.update_usage()
|
||||
# Tamaños de modelos en GB (incluyendo overhead)
|
||||
MODEL_MEMORY_REQUIREMENTS = {
|
||||
"tiny": 0.5, "base": 0.8, "small": 1.5,
|
||||
"medium": 2.5, "large": 4.5,
|
||||
}
|
||||
|
||||
def process(self, file_path: str) -> Dict[str, Any]:
|
||||
"""Transcribe audio file"""
|
||||
self.validate_file(file_path)
|
||||
audio_path = Path(file_path)
|
||||
output_path = settings.LOCAL_DOWNLOADS_PATH / f"{audio_path.stem}.txt"
|
||||
# Cache global singleton - CLAVE para evitar OOM
|
||||
_model_cache: Dict[str, Tuple[whisper.Whisper, str, float]] = {}
|
||||
|
||||
self.logger.info(f"Processing audio file: {audio_path}")
|
||||
TRANSCRIPTION_TIMEOUT_SECONDS = 600
|
||||
MAX_RETRY_ATTEMPTS = 2
|
||||
RETRY_DELAY_SECONDS = 5
|
||||
|
||||
|
||||
# ============ FUNCIONES DE GESTIÓN DE MEMORIA ============
|
||||
|
||||
def get_gpu_memory_info() -> Dict[str, float]:
    """Return GPU memory statistics in GB.

    Returns a dict with keys "total", "free", "used", "reserved"; all
    zeros when torch is missing, CUDA is unavailable, or the query fails.
    """
    zeros = {"total": 0, "free": 0, "used": 0, "reserved": 0}
    try:
        import torch
        if not torch.cuda.is_available():
            return zeros
        gib = 1024 ** 3
        total_gb = torch.cuda.get_device_properties(0).total_memory / gib
        reserved_gb = torch.cuda.memory_reserved(0) / gib
        allocated_gb = torch.cuda.memory_allocated(0) / gib
        return {
            "total": total_gb,
            # "free" is relative to the allocator's reservation, not the OS view.
            "free": total_gb - reserved_gb,
            "used": allocated_gb,
            "reserved": reserved_gb,
        }
    except Exception:
        return zeros
|
||||
|
||||
|
||||
def clear_cuda_cache(aggressive: bool = False) -> None:
    """Release cached CUDA allocator memory.

    With aggressive=True, run three GC + empty_cache cycles to break
    reference cycles that pin GPU tensors. Silently a no-op when torch
    or CUDA is unavailable.
    """
    try:
        import torch
        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()
        if aggressive:
            for _ in range(3):
                gc.collect()
                torch.cuda.empty_cache()
    except Exception:
        # Best-effort cleanup: never let cache clearing crash a caller.
        pass
|
||||
|
||||
|
||||
def check_memory_for_model(model_name: str) -> Tuple[bool, str]:
    """Check whether the GPU can hold *model_name*.

    Returns:
        (fits, device): (True, "cuda") when free VRAM covers the model's
        requirement, otherwise (False, "cpu"). Unknown models assume 2.0 GB.
    """
    required_gb = MODEL_MEMORY_REQUIREMENTS.get(model_name, 2.0)
    gpu = get_gpu_memory_info()

    # No GPU detected at all -> CPU only.
    if gpu["total"] == 0:
        return False, "cpu"

    # Fits (with or without the 1.5x comfort margin) -> use CUDA.
    if gpu["free"] >= required_gb:
        return True, "cuda"

    logger.warning(
        f"Memoria GPU insuficiente para '{model_name}': "
        f"{gpu['free']:.2f}GB libre, {required_gb:.2f}GB necesario"
    )
    return False, "cpu"
|
||||
|
||||
|
||||
def get_cached_model(model_name: str, device: str) -> Optional[whisper.Whisper]:
    """Look up a model in the global cache, refreshing its last-used time.

    Evicts and returns None when the cached entry was loaded for a
    different device; returns None on a plain miss.
    """
    key = f"{model_name}_{device}"
    entry = _model_cache.get(key)
    if entry is None:
        return None

    model, cached_device, _last_used = entry
    if cached_device != device:
        # Stale entry for another device: drop it so it can be reloaded.
        del _model_cache[key]
        return None

    logger.info(f"Modelo '{model_name}' desde cache global")
    _model_cache[key] = (model, cached_device, time.time())
    return model
|
||||
|
||||
|
||||
def cache_model(model_name: str, model: whisper.Whisper, device: str) -> None:
    """Insert *model* into the global cache, keyed by name and device."""
    _model_cache[f"{model_name}_{device}"] = (model, device, time.time())
    logger.info(f"Modelo '{model_name}' cacheado en {device}")
|
||||
|
||||
|
||||
def clear_model_cache() -> None:
    """Drop every cached model and aggressively free CUDA memory."""
    global _model_cache
    entries = list(_model_cache.items())
    _model_cache.clear()
    for _key, (model, _device, _ts) in entries:
        try:
            # Drop the local reference; actual VRAM frees once all refs die.
            del model
        except Exception:
            pass
    clear_cuda_cache(aggressive=True)
|
||||
|
||||
|
||||
# ============ EXCEPCIONES ============
|
||||
|
||||
class AudioProcessingError(Exception):
    """Base error for any failure while processing audio."""
|
||||
|
||||
|
||||
class TranscriptionTimeoutError(AudioProcessingError):
    """Raised when a transcription exceeds the configured time limit."""
|
||||
|
||||
|
||||
class GPUOutOfMemoryError(AudioProcessingError):
    """Raised on CUDA out-of-memory conditions that retries cannot fix."""
|
||||
|
||||
|
||||
class AudioValidationError(AudioProcessingError):
    """Raised when the input audio file fails pre-transcription checks."""
|
||||
|
||||
|
||||
# ============ AUDIO PROCESSOR ============
# NOTE(review): the diff residue spliced fragments of the removed legacy
# process() method into _validate_audio_file and _load_model (stray
# self._load_model() / vram_manager.update_usage() calls, a dead early
# `return {...}` and writes to an undefined output_path). Those fragments
# are removed here; additionally, the `with torch.cuda.device(...)` context
# crashed when the resolved device was "cpu" and is now CUDA-guarded.

class AudioProcessor:
    """Whisper-based audio transcriber with a global model cache.

    Memory-safety features for 8GB GPUs:
      - shared singleton model cache across instances
      - VRAM check before loading, with automatic CPU fallback
      - ffmpeg conversion for formats Whisper handles poorly
      - transcription timeout plus bounded retries
    """

    SUPPORTED_MODELS = ("tiny", "base", "small", "medium", "large")
    DEFAULT_MODEL = settings.WHISPER_MODEL
    DEFAULT_LANGUAGE = "es"

    def __init__(
        self,
        model_name: Optional[str] = None,
        language: Optional[str] = None,
        device: Optional[Literal["cuda", "rocm", "cpu", "auto"]] = None,
    ) -> None:
        """Configure the processor; the model itself is loaded lazily.

        Raises:
            ValueError: if *model_name* is not a supported Whisper size.
        """
        self._model_name = model_name or settings.WHISPER_MODEL
        self._language = language or self.DEFAULT_LANGUAGE
        self._device = device or "auto"
        self._model: Optional[whisper.Whisper] = None
        self._using_cpu_fallback = False
        self._model_id = f"whisper_{self._model_name}"

        if self._model_name not in self.SUPPORTED_MODELS:
            raise ValueError(
                f"Modelo '{self._model_name}' no soportado. "
                f"Disponibles: {', '.join(self.SUPPORTED_MODELS)}"
            )

        logger.info(
            "AudioProcessor inicializado",
            extra={"model": self._model_name, "device": self._device},
        )

    @property
    def model_name(self) -> str:
        """Configured Whisper model size."""
        return self._model_name

    @property
    def language(self) -> str:
        """Language code passed to Whisper."""
        return self._language

    @property
    def device(self) -> str:
        """Resolved device once the model was loaded, else the requested one."""
        return getattr(self, "_resolved_device", self._device)

    @property
    def is_loaded(self) -> bool:
        """True when a model instance is currently held by this processor."""
        return self._model is not None

    def _validate_audio_file(self, audio_path: Path) -> dict:
        """Validate the audio file and probe its properties with ffprobe.

        Returns:
            dict with duration, sample_rate, channels, codec and size_bytes;
            safe defaults when ffprobe is unavailable or fails.

        Raises:
            AudioValidationError: when the file is implausibly small.
        """
        logger.info(f"Validando: {audio_path.name}")

        file_size = audio_path.stat().st_size
        if file_size < 1024:
            raise AudioValidationError("Archivo demasiado pequeño")
        if file_size > 500 * 1024 * 1024:
            logger.warning(f"Archivo grande: {file_size / 1024 / 1024:.1f}MB")

        try:
            cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                   "-show_entries", "stream=channels,sample_rate,codec_name",
                   "-of", "json", str(audio_path)]
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

            if result.returncode == 0:
                import json
                info = json.loads(result.stdout)
                duration = float(info.get("format", {}).get("duration", 0))

                for stream in info.get("streams", []):
                    if stream.get("codec_type") == "audio":
                        return {
                            "duration": duration,
                            "sample_rate": int(stream.get("sample_rate", 16000)),
                            "channels": int(stream.get("channels", 1)),
                            "codec": stream.get("codec_name", "unknown"),
                            "size_bytes": file_size,
                        }
        except Exception:
            # Best-effort probe; fall through to defaults below.
            pass

        return {"duration": 0, "sample_rate": 16000, "channels": 1,
                "codec": "unknown", "size_bytes": file_size}

    def _convert_audio_with_ffmpeg(self, input_path: Path, output_format: str = "wav") -> Path:
        """Convert audio to mono 16kHz via ffmpeg; returns the temp output path.

        Raises:
            AudioProcessingError: when ffmpeg fails or produces no output.
        """
        suffix = f".{output_format}"
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            output_path = Path(tmp.name)

        cmd = ["ffmpeg", "-i", str(input_path),
               "-acodec", "pcm_s16le" if output_format == "wav" else "libmp3lame",
               "-ar", "16000", "-ac", "1", "-y", str(output_path)]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)

        if result.returncode != 0 or not output_path.exists():
            raise AudioProcessingError(f"ffmpeg falló: {result.stderr[-500:] if result.stderr else 'Unknown'}")

        return output_path

    def _get_device_with_memory_check(self) -> str:
        """Resolve the execution device, honoring explicit choices and VRAM."""
        if self._device == "cpu":
            return "cpu"

        if self._device == "auto":
            has_memory, recommended = check_memory_for_model(self._model_name)

            if has_memory and recommended == "cuda":
                try:
                    import torch
                    if torch.cuda.is_available():
                        logger.info(f"GPU detectada: {torch.cuda.get_device_name(0)}")
                        return "cuda"
                except ImportError:
                    pass

            if not has_memory:
                logger.warning("Usando CPU por falta de memoria GPU")
                self._using_cpu_fallback = True
            return "cpu"

        return self._device

    def _load_model(self, force_reload: bool = False) -> None:
        """Load the Whisper model through the global cache.

        Raises:
            AudioProcessingError: on any load failure (including CUDA OOM).
        """
        if self._model is not None and not force_reload:
            return

        # Expandable segments reduce CUDA allocator fragmentation.
        os.environ['PYTORCH_ALLOC_CONF'] = 'expandable_segments:True'

        clear_cuda_cache(aggressive=True)
        self._resolved_device = self._get_device_with_memory_check()

        if not force_reload:
            cached = get_cached_model(self._model_name, self._resolved_device)
            if cached is not None:
                self._model = cached
                return

        try:
            import torch
            self._model = whisper.load_model(
                self._model_name,
                device=self._resolved_device,
                download_root=None,
                in_memory=True,  # keep checkpoint in RAM, reduce disk churn
            )

            # Release loader scratch memory right away (GPU only).
            if self._resolved_device == "cuda":
                torch.cuda.empty_cache()

            cache_model(self._model_name, self._model, self._resolved_device)

            gpu_info = get_gpu_memory_info()
            logger.info(
                f"Modelo cargado en {self._resolved_device}",
                extra={"gpu_used_gb": round(gpu_info.get("used", 0), 2),
                       "gpu_free_gb": round(gpu_info.get("free", 0), 2)},
            )
            vram_manager.update_usage(self._model_id)

        except RuntimeError as e:
            error_str = str(e)
            if "out of memory" in error_str.lower():
                # Policy: do not silently fall back here — clean up and surface OOM.
                logger.error("OOM en GPU - limpiando memoria para reintentar...")
                clear_cuda_cache(aggressive=True)
                raise AudioProcessingError(f"CUDA OOM - limpie la GPU y reintente. {error_str}") from e
            raise AudioProcessingError(f"Error cargando modelo: {e}") from e
        except Exception as e:
            raise AudioProcessingError(f"Error cargando modelo: {e}") from e

    def _transcribe_internal(self, audio_path: Path, audio_properties: dict) -> str:
        """Run the actual Whisper transcription and return stripped text."""
        result = self._model.transcribe(
            str(audio_path),
            language=self._language,
            # fp16 only makes sense on GPU backends.
            fp16=self._resolved_device in ("cuda", "rocm"),
            verbose=False,
        )
        return result.get("text", "").strip()

    def transcribe(self, audio_path: str) -> str:
        """Transcribe an audio file and return the recognized text.

        Raises:
            FileNotFoundError: when the path does not exist.
            AudioValidationError: when validation fails.
            TranscriptionTimeoutError: when the run exceeds the time limit.
            GPUOutOfMemoryError: when VRAM is exhausted after retries.
            AudioProcessingError: on any other failure after retries.
        """
        audio_file = Path(audio_path)
        if not audio_file.exists():
            raise FileNotFoundError(f"Archivo no encontrado: {audio_path}")

        vram_manager.update_usage(self._model_id)

        try:
            audio_properties = self._validate_audio_file(audio_file)
        except AudioValidationError as e:
            logger.error(f"Validación falló: {e}")
            raise

        converted_file: Optional[Path] = None
        last_error: Optional[Exception] = None

        for attempt in range(MAX_RETRY_ATTEMPTS):
            try:
                force_reload = attempt > 0
                if self._model is None or force_reload:
                    self._load_model(force_reload=force_reload)

                audio_to_transcribe = audio_file

                # Whisper copes poorly with some codecs / multichannel input.
                needs_conversion = (
                    audio_file.suffix.lower() not in {".wav", ".mp3"} or
                    audio_properties.get("codec") in ("aac", "opus", "vorbis") or
                    audio_properties.get("channels", 1) > 1
                )

                if needs_conversion:
                    try:
                        converted_file = self._convert_audio_with_ffmpeg(audio_file, "wav")
                        audio_to_transcribe = converted_file
                    except AudioProcessingError as e:
                        # Conversion is best-effort: fall back to the original file.
                        logger.warning(f"Conversión falló: {e}")

                logger.info(
                    f"Transcribiendo: {audio_file.name}",
                    extra={"device": self._resolved_device, "cpu_fallback": self._using_cpu_fallback},
                )

                # Run in a worker thread so we can enforce a hard timeout.
                with ThreadPoolExecutor(max_workers=1) as executor:
                    future = executor.submit(self._transcribe_internal, audio_to_transcribe, audio_properties)
                    try:
                        text = future.result(timeout=TRANSCRIPTION_TIMEOUT_SECONDS)
                    except FutureTimeoutError:
                        self.unload()
                        raise TranscriptionTimeoutError(f"Timeout después de {TRANSCRIPTION_TIMEOUT_SECONDS}s")

                logger.info(f"Transcripción completada: {len(text)} caracteres")
                return text

            except RuntimeError as e:
                error_str = str(e)
                last_error = e

                if "out of memory" in error_str.lower():
                    logger.warning("OOM durante transcripción...")
                    clear_cuda_cache(aggressive=True)

                    # First OOM on GPU: retry the whole attempt on CPU.
                    if not self._using_cpu_fallback and self._resolved_device in ("cuda", "rocm"):
                        self.unload()
                        self._resolved_device = "cpu"
                        self._using_cpu_fallback = True
                        self._load_model()
                        continue

                    if attempt >= MAX_RETRY_ATTEMPTS - 1:
                        raise GPUOutOfMemoryError("Memoria GPU insuficiente") from e
                    time.sleep(RETRY_DELAY_SECONDS)
                    continue

                # Known Whisper failure mode on odd containers: force WAV and retry once inline.
                if "Key and Value must have the same sequence length" in error_str:
                    if not converted_file:
                        converted_file = self._convert_audio_with_ffmpeg(audio_file, "wav")
                    text = self._model.transcribe(
                        str(converted_file), language=self._language,
                        fp16=self._resolved_device in ("cuda", "rocm"), verbose=False
                    ).get("text", "").strip()
                    converted_file.unlink()
                    return text

                raise AudioProcessingError(f"Error de transcripción: {e}") from e

            except (TranscriptionTimeoutError, GPUOutOfMemoryError):
                raise
            except Exception as e:
                last_error = e
                self.unload()

                if attempt >= MAX_RETRY_ATTEMPTS - 1:
                    raise AudioProcessingError(f"Error después de {MAX_RETRY_ATTEMPTS} intentos: {e}") from e

                time.sleep(RETRY_DELAY_SECONDS)

            finally:
                # Always remove the temp conversion, success or failure.
                if converted_file and converted_file.exists():
                    try:
                        converted_file.unlink()
                    except Exception:
                        pass

        raise AudioProcessingError(f"Error al transcribir: {last_error}") from last_error

    def unload(self) -> None:
        """Drop this instance's model reference and release GPU memory."""
        if self._model is not None:
            del self._model
            self._model = None
            # NOTE(review): both calls kept from the merged diff — confirm
            # whether vram_manager.cleanup() is still part of the new API.
            vram_manager.cleanup()
            clear_cuda_cache(aggressive=False)
            vram_manager.unregister_model(self._model_id)

    def __repr__(self) -> str:
        return f"AudioProcessor(model='{self._model_name}', device='{self.device}', loaded={self.is_loaded})"

    def __del__(self) -> None:
        try:
            self.unload()
        except Exception:
            # Never raise from a destructor.
            pass
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
"""
|
||||
Base File Processor (Strategy Pattern)
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from core import FileProcessingError
|
||||
|
||||
|
||||
class FileProcessor(ABC):
    """Abstract base for all file processors (Strategy pattern).

    Subclasses implement `can_process` and `process`; the path helpers
    and validation are shared.
    """

    def __init__(self, name: str):
        self.name = name

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Return True when this processor can handle *file_path*."""
        ...

    @abstractmethod
    def process(self, file_path: str) -> Dict[str, Any]:
        """Process *file_path* and return a result dict."""
        ...

    def get_file_extension(self, file_path: str) -> str:
        """Lower-cased extension of *file_path*, including the dot."""
        return Path(file_path).suffix.lower()

    def get_base_name(self, file_path: str) -> str:
        """File name of *file_path* without its final extension."""
        return Path(file_path).stem

    def validate_file(self, file_path: str) -> None:
        """Ensure *file_path* exists and is a regular file.

        Raises:
            FileProcessingError: when the path is missing or not a file.
        """
        candidate = Path(file_path)
        if not candidate.exists():
            raise FileProcessingError(f"File not found: {file_path}")
        if not candidate.is_file():
            raise FileProcessingError(f"Path is not a file: {file_path}")
|
||||
@@ -1,164 +0,0 @@
|
||||
"""
|
||||
PDF file processor with OCR
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from core import FileProcessingError
|
||||
from config import settings
|
||||
from services import vram_manager
|
||||
from services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
try:
|
||||
import torch
|
||||
import pytesseract
|
||||
import easyocr
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pdf2image import convert_from_path
|
||||
from PIL import Image
|
||||
PDF_OCR_AVAILABLE = True
|
||||
except ImportError:
|
||||
PDF_OCR_AVAILABLE = False
|
||||
# Provide stub for type hints
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
Image = None # type: ignore
|
||||
|
||||
|
||||
class PDFProcessor(FileProcessor):
    """Processor for PDF files with OCR.

    Renders each page to an image, then runs two OCR engines (EasyOCR and
    Tesseract, both Spanish) in parallel per batch; EasyOCR output is
    preferred, Tesseract is the fallback.
    """

    def __init__(self):
        super().__init__("PDFProcessor")
        self.logger = logging.getLogger(__name__)
        # EasyOCR reader is created lazily on first use (it costs VRAM).
        self._easyocr_reader = None

    def can_process(self, file_path: str) -> bool:
        """Return True only for .pdf files."""
        return self.get_file_extension(file_path) == ".pdf"

    def _load_easyocr(self):
        """Create the EasyOCR reader once, on GPU when one is available."""
        if self._easyocr_reader is None:
            use_gpu = gpu_detector.is_available()
            self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})")
            self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu)
            vram_manager.update_usage()

    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """Prepare a page image for OCR: grayscale plus 2x upscale."""
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')

        # 2x Lanczos upscale helps both engines with small glyphs.
        image = image.resize((image.width * 2, image.height * 2), Image.Resampling.LANCZOS)

        return image

    def _run_ocr_parallel(self, pil_images) -> Dict[str, list]:
        """Run both OCR engines over *pil_images* concurrently.

        Returns:
            Dict with keys 'easyocr' and 'tesseract', each a list aligned
            with *pil_images*; a failed engine degrades to empty strings.
        """
        results = {
            'easyocr': [''] * len(pil_images),
            'tesseract': [''] * len(pil_images)
        }

        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {}

            # EasyOCR — only when the reader was successfully loaded.
            if self._easyocr_reader:
                futures['easyocr'] = executor.submit(
                    self._easyocr_reader.readtext_batched,
                    pil_images,
                    detail=0
                )

            # Tesseract with the Spanish language pack.
            futures['tesseract'] = executor.submit(
                lambda imgs: [pytesseract.image_to_string(img, lang='spa') for img in imgs],
                pil_images
            )

            # Collect results; one engine failing must not sink the other.
            for name, future in futures.items():
                try:
                    results[name] = future.result()
                except Exception as e:
                    self.logger.error(f"OCR engine {name} failed: {e}")
                    results[name] = [''] * len(pil_images)

        return results

    def process(self, file_path: str) -> Dict[str, Any]:
        """OCR the PDF and write the combined text to LOCAL_DOWNLOADS_PATH.

        Returns:
            Dict with success flag, output text path, full text, and the
            number of pages rendered.

        Raises:
            FileProcessingError: when OCR dependencies are missing or any
                step of the pipeline fails.
        """
        self.validate_file(file_path)
        pdf_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt"

        if not PDF_OCR_AVAILABLE:
            raise FileProcessingError("PDF OCR dependencies not installed")

        self.logger.info(f"Processing PDF file: {pdf_path}")

        try:
            # Load EasyOCR if needed
            self._load_easyocr()
            vram_manager.update_usage()

            # Convert PDF pages to PNG images.
            self.logger.debug("Converting PDF to images")
            pil_images = convert_from_path(
                str(pdf_path),
                dpi=settings.PDF_DPI,
                fmt='png',
                thread_count=settings.PDF_RENDER_THREAD_COUNT
            )

            # Process in batches to bound memory usage.
            all_text = []
            batch_size = settings.PDF_BATCH_SIZE

            for i in range(0, len(pil_images), batch_size):
                batch = pil_images[i:i + batch_size]
                self.logger.debug(f"Processing batch {i//batch_size + 1}/{(len(pil_images) + batch_size - 1)//batch_size}")

                # Preprocess images
                preprocessed_batch = [self._preprocess_image(img) for img in batch]

                # Run OCR in parallel
                ocr_results = self._run_ocr_parallel(preprocessed_batch)

                # Combine results: prefer EasyOCR, fall back to Tesseract.
                for j, img in enumerate(batch):
                    text = ocr_results['easyocr'][j] if ocr_results['easyocr'][j] else ocr_results['tesseract'][j]
                    if text:
                        all_text.append(text)

            # Save combined text
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write("\n\n".join(all_text))

            self.logger.info(f"PDF processing completed: {output_path}")

            return {
                "success": True,
                "text_path": str(output_path),
                "text": "\n\n".join(all_text),
                "pages_processed": len(pil_images)
            }

        except Exception as e:
            self.logger.error(f"PDF processing failed: {e}")
            raise FileProcessingError(f"PDF processing failed: {e}")

    def cleanup(self) -> None:
        """Drop the EasyOCR reader and let the VRAM manager reclaim memory."""
        self._easyocr_reader = None
        vram_manager.cleanup()
|
||||
@@ -1,55 +0,0 @@
|
||||
"""
|
||||
Text file processor
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from core import FileProcessingError
|
||||
from config import settings
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
|
||||
class TextProcessor(FileProcessor):
    """Handles plain-text files by copying them into the downloads folder."""

    def __init__(self):
        super().__init__("TextProcessor")
        self.logger = logging.getLogger(__name__)

    def can_process(self, file_path: str) -> bool:
        """A file is accepted when its extension is a configured text type."""
        return self.get_file_extension(file_path) in settings.TXT_EXTENSIONS

    def process(self, file_path: str) -> Dict[str, Any]:
        """Copy the text file into LOCAL_DOWNLOADS_PATH and return its content.

        Raises:
            FileProcessingError: on validation or copy failure.
        """
        self.validate_file(file_path)
        source = Path(file_path)
        destination = settings.LOCAL_DOWNLOADS_PATH / source.name

        self.logger.info(f"Processing text file: {source}")

        try:
            destination.parent.mkdir(parents=True, exist_ok=True)
            destination.write_text(source.read_text(encoding='utf-8'), encoding='utf-8')

            self.logger.info(f"Text file processing completed: {destination}")

            return {
                "success": True,
                "text_path": str(destination),
                "text": self._read_file(destination),
            }

        except Exception as e:
            self.logger.error(f"Text processing failed: {e}")
            raise FileProcessingError(f"Text processing failed: {e}")

    def _read_file(self, file_path: Path) -> str:
        """Return the UTF-8 content of *file_path*."""
        return file_path.read_text(encoding='utf-8')
|
||||
Reference in New Issue
Block a user