feat: Sistema CBCFacil completo con cola secuencial
- Implementa ProcessingMonitor singleton para procesamiento secuencial de archivos - Agrega AI summary service con soporte para MiniMax API - Agrega PDF generator para resúmenes - Agrega watchers para monitoreo de carpeta remota - Mejora sistema de notificaciones Telegram - Implementa gestión de VRAM para GPU - Configuración mediante variables de entorno (sin hardcoded secrets) - .env y transcriptions/ agregados a .gitignore Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
"""Procesadores de documentos y medios."""
# NOTE(review): the diff residue contained two conflicting module bodies
# (the old four-processor package and the new audio-only one). The new
# version is kept, per the final `__all__` in the diff.

from processors.audio_processor import AudioProcessor, AudioProcessingError

__all__ = ["AudioProcessor", "AudioProcessingError"]
||||
@@ -1,93 +1,467 @@
|
||||
"""
|
||||
Audio file processor using Whisper
|
||||
Procesador de audio para transcripción con Whisper.
|
||||
|
||||
OPTIMIZACIONES DE MEMORIA PARA GPUs DE 8GB:
|
||||
- Cache global singleton para evitar carga múltiple del modelo
|
||||
- Configuración PYTORCH_ALLOC_CONF para reducir fragmentación
|
||||
- Verificación de VRAM antes de cargar
|
||||
- Fallback automático a CPU si GPU OOM
|
||||
- Limpieza agresiva de cache CUDA
|
||||
"""
|
||||
import gc
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from core import FileProcessingError
|
||||
from typing import Dict, Literal, Optional, Tuple
|
||||
|
||||
import whisper
|
||||
|
||||
from config import settings
|
||||
from services import vram_manager
|
||||
from services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
try:
|
||||
import whisper
|
||||
import torch
|
||||
WHISPER_AVAILABLE = True
|
||||
except ImportError:
|
||||
WHISPER_AVAILABLE = False
|
||||
from services.vram_manager import vram_manager
|
||||
|
||||
|
||||
class AudioProcessor(FileProcessor):
|
||||
"""Processor for audio files using Whisper"""
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def __init__(self):
|
||||
super().__init__("AudioProcessor")
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._model = None
|
||||
self._model_name = "medium" # Optimized for Spanish
|
||||
|
||||
def can_process(self, file_path: str) -> bool:
|
||||
"""Check if file is an audio file"""
|
||||
ext = self.get_file_extension(file_path)
|
||||
return ext in settings.AUDIO_EXTENSIONS
|
||||
# ============ CONFIGURACIÓN DE OPTIMIZACIONES ============
|
||||
|
||||
def _load_model(self):
|
||||
"""Load Whisper model lazily"""
|
||||
if not WHISPER_AVAILABLE:
|
||||
raise FileProcessingError("Whisper not installed")
|
||||
# CRÍTICO: Permite segmentos expandibles para reducir fragmentación
|
||||
os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
|
||||
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
|
||||
|
||||
if self._model is None:
|
||||
device = gpu_detector.get_device()
|
||||
self.logger.info(f"Loading Whisper model: {self._model_name} on {device}")
|
||||
self._model = whisper.load_model(self._model_name, device=device)
|
||||
vram_manager.update_usage()
|
||||
# Tamaños de modelos en GB (incluyendo overhead)
|
||||
MODEL_MEMORY_REQUIREMENTS = {
|
||||
"tiny": 0.5, "base": 0.8, "small": 1.5,
|
||||
"medium": 2.5, "large": 4.5,
|
||||
}
|
||||
|
||||
def process(self, file_path: str) -> Dict[str, Any]:
|
||||
"""Transcribe audio file"""
|
||||
self.validate_file(file_path)
|
||||
audio_path = Path(file_path)
|
||||
output_path = settings.LOCAL_DOWNLOADS_PATH / f"{audio_path.stem}.txt"
|
||||
# Cache global singleton - CLAVE para evitar OOM
|
||||
_model_cache: Dict[str, Tuple[whisper.Whisper, str, float]] = {}
|
||||
|
||||
self.logger.info(f"Processing audio file: {audio_path}")
|
||||
TRANSCRIPTION_TIMEOUT_SECONDS = 600
|
||||
MAX_RETRY_ATTEMPTS = 2
|
||||
RETRY_DELAY_SECONDS = 5
|
||||
|
||||
|
||||
# ============ FUNCIONES DE GESTIÓN DE MEMORIA ============
|
||||
|
||||
def get_gpu_memory_info() -> Dict[str, float]:
    """Return GPU memory statistics in GB.

    Returns a dict with keys "total", "free", "used", "reserved"; all
    zeros when torch is missing, CUDA is unavailable, or the query fails.
    """
    zeros = {"total": 0, "free": 0, "used": 0, "reserved": 0}
    try:
        import torch
        if not torch.cuda.is_available():
            return zeros
        gib = 1024 ** 3
        total_gb = torch.cuda.get_device_properties(0).total_memory / gib
        reserved_gb = torch.cuda.memory_reserved(0) / gib
        allocated_gb = torch.cuda.memory_allocated(0) / gib
        return {
            "total": total_gb,
            # "free" is relative to the allocator's reservation, not the OS view.
            "free": total_gb - reserved_gb,
            "used": allocated_gb,
            "reserved": reserved_gb,
        }
    except Exception:
        return zeros
|
||||
|
||||
|
||||
def clear_cuda_cache(aggressive: bool = False) -> None:
    """Release cached CUDA allocator memory.

    With aggressive=True, run three GC + empty_cache cycles to break
    reference cycles that pin GPU tensors. Silently a no-op when torch
    or CUDA is unavailable.
    """
    try:
        import torch
        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()
        if aggressive:
            for _ in range(3):
                gc.collect()
                torch.cuda.empty_cache()
    except Exception:
        # Best-effort cleanup: never let cache clearing crash a caller.
        pass
|
||||
|
||||
|
||||
def check_memory_for_model(model_name: str) -> Tuple[bool, str]:
    """Check whether the GPU can hold *model_name*.

    Returns:
        (fits, device): (True, "cuda") when free VRAM covers the model's
        requirement, otherwise (False, "cpu"). Unknown models assume 2.0 GB.
    """
    required_gb = MODEL_MEMORY_REQUIREMENTS.get(model_name, 2.0)
    gpu = get_gpu_memory_info()

    # No GPU detected at all -> CPU only.
    if gpu["total"] == 0:
        return False, "cpu"

    # Fits (with or without the 1.5x comfort margin) -> use CUDA.
    if gpu["free"] >= required_gb:
        return True, "cuda"

    logger.warning(
        f"Memoria GPU insuficiente para '{model_name}': "
        f"{gpu['free']:.2f}GB libre, {required_gb:.2f}GB necesario"
    )
    return False, "cpu"
|
||||
|
||||
|
||||
def get_cached_model(model_name: str, device: str) -> Optional[whisper.Whisper]:
    """Look up a model in the global cache, refreshing its last-used time.

    Evicts and returns None when the cached entry was loaded for a
    different device; returns None on a plain miss.
    """
    key = f"{model_name}_{device}"
    entry = _model_cache.get(key)
    if entry is None:
        return None

    model, cached_device, _last_used = entry
    if cached_device != device:
        # Stale entry for another device: drop it so it can be reloaded.
        del _model_cache[key]
        return None

    logger.info(f"Modelo '{model_name}' desde cache global")
    _model_cache[key] = (model, cached_device, time.time())
    return model
|
||||
|
||||
|
||||
def cache_model(model_name: str, model: whisper.Whisper, device: str) -> None:
    """Insert *model* into the global cache, keyed by name and device."""
    _model_cache[f"{model_name}_{device}"] = (model, device, time.time())
    logger.info(f"Modelo '{model_name}' cacheado en {device}")
|
||||
|
||||
|
||||
def clear_model_cache() -> None:
    """Drop every cached model and aggressively free CUDA memory."""
    global _model_cache
    entries = list(_model_cache.items())
    _model_cache.clear()
    for _key, (model, _device, _ts) in entries:
        try:
            # Drop the local reference; actual VRAM frees once all refs die.
            del model
        except Exception:
            pass
    clear_cuda_cache(aggressive=True)
|
||||
|
||||
|
||||
# ============ EXCEPCIONES ============
|
||||
|
||||
class AudioProcessingError(Exception):
    """Base error for any failure while processing audio."""
|
||||
|
||||
|
||||
class TranscriptionTimeoutError(AudioProcessingError):
    """Raised when a transcription exceeds the configured time limit."""
|
||||
|
||||
|
||||
class GPUOutOfMemoryError(AudioProcessingError):
    """Raised on CUDA out-of-memory conditions that retries cannot fix."""
|
||||
|
||||
|
||||
class AudioValidationError(AudioProcessingError):
    """Raised when the input audio file fails pre-transcription checks."""
|
||||
|
||||
|
||||
# ============ AUDIO PROCESSOR ============
# NOTE(review): the diff residue spliced fragments of the removed legacy
# process() method into _validate_audio_file and _load_model (stray
# self._load_model() / vram_manager.update_usage() calls, a dead early
# `return {...}` and writes to an undefined output_path). Those fragments
# are removed here; additionally, the `with torch.cuda.device(...)` context
# crashed when the resolved device was "cpu" and is now CUDA-guarded.

class AudioProcessor:
    """Whisper-based audio transcriber with a global model cache.

    Memory-safety features for 8GB GPUs:
      - shared singleton model cache across instances
      - VRAM check before loading, with automatic CPU fallback
      - ffmpeg conversion for formats Whisper handles poorly
      - transcription timeout plus bounded retries
    """

    SUPPORTED_MODELS = ("tiny", "base", "small", "medium", "large")
    DEFAULT_MODEL = settings.WHISPER_MODEL
    DEFAULT_LANGUAGE = "es"

    def __init__(
        self,
        model_name: Optional[str] = None,
        language: Optional[str] = None,
        device: Optional[Literal["cuda", "rocm", "cpu", "auto"]] = None,
    ) -> None:
        """Configure the processor; the model itself is loaded lazily.

        Raises:
            ValueError: if *model_name* is not a supported Whisper size.
        """
        self._model_name = model_name or settings.WHISPER_MODEL
        self._language = language or self.DEFAULT_LANGUAGE
        self._device = device or "auto"
        self._model: Optional[whisper.Whisper] = None
        self._using_cpu_fallback = False
        self._model_id = f"whisper_{self._model_name}"

        if self._model_name not in self.SUPPORTED_MODELS:
            raise ValueError(
                f"Modelo '{self._model_name}' no soportado. "
                f"Disponibles: {', '.join(self.SUPPORTED_MODELS)}"
            )

        logger.info(
            "AudioProcessor inicializado",
            extra={"model": self._model_name, "device": self._device},
        )

    @property
    def model_name(self) -> str:
        """Configured Whisper model size."""
        return self._model_name

    @property
    def language(self) -> str:
        """Language code passed to Whisper."""
        return self._language

    @property
    def device(self) -> str:
        """Resolved device once the model was loaded, else the requested one."""
        return getattr(self, "_resolved_device", self._device)

    @property
    def is_loaded(self) -> bool:
        """True when a model instance is currently held by this processor."""
        return self._model is not None

    def _validate_audio_file(self, audio_path: Path) -> dict:
        """Validate the audio file and probe its properties with ffprobe.

        Returns:
            dict with duration, sample_rate, channels, codec and size_bytes;
            safe defaults when ffprobe is unavailable or fails.

        Raises:
            AudioValidationError: when the file is implausibly small.
        """
        logger.info(f"Validando: {audio_path.name}")

        file_size = audio_path.stat().st_size
        if file_size < 1024:
            raise AudioValidationError("Archivo demasiado pequeño")
        if file_size > 500 * 1024 * 1024:
            logger.warning(f"Archivo grande: {file_size / 1024 / 1024:.1f}MB")

        try:
            cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                   "-show_entries", "stream=channels,sample_rate,codec_name",
                   "-of", "json", str(audio_path)]
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

            if result.returncode == 0:
                import json
                info = json.loads(result.stdout)
                duration = float(info.get("format", {}).get("duration", 0))

                for stream in info.get("streams", []):
                    if stream.get("codec_type") == "audio":
                        return {
                            "duration": duration,
                            "sample_rate": int(stream.get("sample_rate", 16000)),
                            "channels": int(stream.get("channels", 1)),
                            "codec": stream.get("codec_name", "unknown"),
                            "size_bytes": file_size,
                        }
        except Exception:
            # Best-effort probe; fall through to defaults below.
            pass

        return {"duration": 0, "sample_rate": 16000, "channels": 1,
                "codec": "unknown", "size_bytes": file_size}

    def _convert_audio_with_ffmpeg(self, input_path: Path, output_format: str = "wav") -> Path:
        """Convert audio to mono 16kHz via ffmpeg; returns the temp output path.

        Raises:
            AudioProcessingError: when ffmpeg fails or produces no output.
        """
        suffix = f".{output_format}"
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            output_path = Path(tmp.name)

        cmd = ["ffmpeg", "-i", str(input_path),
               "-acodec", "pcm_s16le" if output_format == "wav" else "libmp3lame",
               "-ar", "16000", "-ac", "1", "-y", str(output_path)]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)

        if result.returncode != 0 or not output_path.exists():
            raise AudioProcessingError(f"ffmpeg falló: {result.stderr[-500:] if result.stderr else 'Unknown'}")

        return output_path

    def _get_device_with_memory_check(self) -> str:
        """Resolve the execution device, honoring explicit choices and VRAM."""
        if self._device == "cpu":
            return "cpu"

        if self._device == "auto":
            has_memory, recommended = check_memory_for_model(self._model_name)

            if has_memory and recommended == "cuda":
                try:
                    import torch
                    if torch.cuda.is_available():
                        logger.info(f"GPU detectada: {torch.cuda.get_device_name(0)}")
                        return "cuda"
                except ImportError:
                    pass

            if not has_memory:
                logger.warning("Usando CPU por falta de memoria GPU")
                self._using_cpu_fallback = True
            return "cpu"

        return self._device

    def _load_model(self, force_reload: bool = False) -> None:
        """Load the Whisper model through the global cache.

        Raises:
            AudioProcessingError: on any load failure (including CUDA OOM).
        """
        if self._model is not None and not force_reload:
            return

        # Expandable segments reduce CUDA allocator fragmentation.
        os.environ['PYTORCH_ALLOC_CONF'] = 'expandable_segments:True'

        clear_cuda_cache(aggressive=True)
        self._resolved_device = self._get_device_with_memory_check()

        if not force_reload:
            cached = get_cached_model(self._model_name, self._resolved_device)
            if cached is not None:
                self._model = cached
                return

        try:
            import torch
            self._model = whisper.load_model(
                self._model_name,
                device=self._resolved_device,
                download_root=None,
                in_memory=True,  # keep checkpoint in RAM, reduce disk churn
            )

            # Release loader scratch memory right away (GPU only).
            if self._resolved_device == "cuda":
                torch.cuda.empty_cache()

            cache_model(self._model_name, self._model, self._resolved_device)

            gpu_info = get_gpu_memory_info()
            logger.info(
                f"Modelo cargado en {self._resolved_device}",
                extra={"gpu_used_gb": round(gpu_info.get("used", 0), 2),
                       "gpu_free_gb": round(gpu_info.get("free", 0), 2)},
            )
            vram_manager.update_usage(self._model_id)

        except RuntimeError as e:
            error_str = str(e)
            if "out of memory" in error_str.lower():
                # Policy: do not silently fall back here — clean up and surface OOM.
                logger.error("OOM en GPU - limpiando memoria para reintentar...")
                clear_cuda_cache(aggressive=True)
                raise AudioProcessingError(f"CUDA OOM - limpie la GPU y reintente. {error_str}") from e
            raise AudioProcessingError(f"Error cargando modelo: {e}") from e
        except Exception as e:
            raise AudioProcessingError(f"Error cargando modelo: {e}") from e

    def _transcribe_internal(self, audio_path: Path, audio_properties: dict) -> str:
        """Run the actual Whisper transcription and return stripped text."""
        result = self._model.transcribe(
            str(audio_path),
            language=self._language,
            # fp16 only makes sense on GPU backends.
            fp16=self._resolved_device in ("cuda", "rocm"),
            verbose=False,
        )
        return result.get("text", "").strip()

    def transcribe(self, audio_path: str) -> str:
        """Transcribe an audio file and return the recognized text.

        Raises:
            FileNotFoundError: when the path does not exist.
            AudioValidationError: when validation fails.
            TranscriptionTimeoutError: when the run exceeds the time limit.
            GPUOutOfMemoryError: when VRAM is exhausted after retries.
            AudioProcessingError: on any other failure after retries.
        """
        audio_file = Path(audio_path)
        if not audio_file.exists():
            raise FileNotFoundError(f"Archivo no encontrado: {audio_path}")

        vram_manager.update_usage(self._model_id)

        try:
            audio_properties = self._validate_audio_file(audio_file)
        except AudioValidationError as e:
            logger.error(f"Validación falló: {e}")
            raise

        converted_file: Optional[Path] = None
        last_error: Optional[Exception] = None

        for attempt in range(MAX_RETRY_ATTEMPTS):
            try:
                force_reload = attempt > 0
                if self._model is None or force_reload:
                    self._load_model(force_reload=force_reload)

                audio_to_transcribe = audio_file

                # Whisper copes poorly with some codecs / multichannel input.
                needs_conversion = (
                    audio_file.suffix.lower() not in {".wav", ".mp3"} or
                    audio_properties.get("codec") in ("aac", "opus", "vorbis") or
                    audio_properties.get("channels", 1) > 1
                )

                if needs_conversion:
                    try:
                        converted_file = self._convert_audio_with_ffmpeg(audio_file, "wav")
                        audio_to_transcribe = converted_file
                    except AudioProcessingError as e:
                        # Conversion is best-effort: fall back to the original file.
                        logger.warning(f"Conversión falló: {e}")

                logger.info(
                    f"Transcribiendo: {audio_file.name}",
                    extra={"device": self._resolved_device, "cpu_fallback": self._using_cpu_fallback},
                )

                # Run in a worker thread so we can enforce a hard timeout.
                with ThreadPoolExecutor(max_workers=1) as executor:
                    future = executor.submit(self._transcribe_internal, audio_to_transcribe, audio_properties)
                    try:
                        text = future.result(timeout=TRANSCRIPTION_TIMEOUT_SECONDS)
                    except FutureTimeoutError:
                        self.unload()
                        raise TranscriptionTimeoutError(f"Timeout después de {TRANSCRIPTION_TIMEOUT_SECONDS}s")

                logger.info(f"Transcripción completada: {len(text)} caracteres")
                return text

            except RuntimeError as e:
                error_str = str(e)
                last_error = e

                if "out of memory" in error_str.lower():
                    logger.warning("OOM durante transcripción...")
                    clear_cuda_cache(aggressive=True)

                    # First OOM on GPU: retry the whole attempt on CPU.
                    if not self._using_cpu_fallback and self._resolved_device in ("cuda", "rocm"):
                        self.unload()
                        self._resolved_device = "cpu"
                        self._using_cpu_fallback = True
                        self._load_model()
                        continue

                    if attempt >= MAX_RETRY_ATTEMPTS - 1:
                        raise GPUOutOfMemoryError("Memoria GPU insuficiente") from e
                    time.sleep(RETRY_DELAY_SECONDS)
                    continue

                # Known Whisper failure mode on odd containers: force WAV and retry once inline.
                if "Key and Value must have the same sequence length" in error_str:
                    if not converted_file:
                        converted_file = self._convert_audio_with_ffmpeg(audio_file, "wav")
                    text = self._model.transcribe(
                        str(converted_file), language=self._language,
                        fp16=self._resolved_device in ("cuda", "rocm"), verbose=False
                    ).get("text", "").strip()
                    converted_file.unlink()
                    return text

                raise AudioProcessingError(f"Error de transcripción: {e}") from e

            except (TranscriptionTimeoutError, GPUOutOfMemoryError):
                raise
            except Exception as e:
                last_error = e
                self.unload()

                if attempt >= MAX_RETRY_ATTEMPTS - 1:
                    raise AudioProcessingError(f"Error después de {MAX_RETRY_ATTEMPTS} intentos: {e}") from e

                time.sleep(RETRY_DELAY_SECONDS)

            finally:
                # Always remove the temp conversion, success or failure.
                if converted_file and converted_file.exists():
                    try:
                        converted_file.unlink()
                    except Exception:
                        pass

        raise AudioProcessingError(f"Error al transcribir: {last_error}") from last_error

    def unload(self) -> None:
        """Drop this instance's model reference and release GPU memory."""
        if self._model is not None:
            del self._model
            self._model = None
            # NOTE(review): both calls kept from the merged diff — confirm
            # whether vram_manager.cleanup() is still part of the new API.
            vram_manager.cleanup()
            clear_cuda_cache(aggressive=False)
            vram_manager.unregister_model(self._model_id)

    def __repr__(self) -> str:
        return f"AudioProcessor(model='{self._model_name}', device='{self.device}', loaded={self.is_loaded})"

    def __del__(self) -> None:
        try:
            self.unload()
        except Exception:
            # Never raise from a destructor.
            pass
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
"""
|
||||
Base File Processor (Strategy Pattern)
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from core import FileProcessingError
|
||||
|
||||
|
||||
class FileProcessor(ABC):
    """Abstract base for all file processors (Strategy pattern).

    Subclasses implement `can_process` and `process`; the path helpers
    and validation are shared.
    """

    def __init__(self, name: str):
        self.name = name

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Return True when this processor can handle *file_path*."""
        ...

    @abstractmethod
    def process(self, file_path: str) -> Dict[str, Any]:
        """Process *file_path* and return a result dict."""
        ...

    def get_file_extension(self, file_path: str) -> str:
        """Lower-cased extension of *file_path*, including the dot."""
        return Path(file_path).suffix.lower()

    def get_base_name(self, file_path: str) -> str:
        """File name of *file_path* without its final extension."""
        return Path(file_path).stem

    def validate_file(self, file_path: str) -> None:
        """Ensure *file_path* exists and is a regular file.

        Raises:
            FileProcessingError: when the path is missing or not a file.
        """
        candidate = Path(file_path)
        if not candidate.exists():
            raise FileProcessingError(f"File not found: {file_path}")
        if not candidate.is_file():
            raise FileProcessingError(f"Path is not a file: {file_path}")
|
||||
@@ -1,164 +0,0 @@
|
||||
"""
|
||||
PDF file processor with OCR
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from core import FileProcessingError
|
||||
from config import settings
|
||||
from services import vram_manager
|
||||
from services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
try:
|
||||
import torch
|
||||
import pytesseract
|
||||
import easyocr
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pdf2image import convert_from_path
|
||||
from PIL import Image
|
||||
PDF_OCR_AVAILABLE = True
|
||||
except ImportError:
|
||||
PDF_OCR_AVAILABLE = False
|
||||
# Provide stub for type hints
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
Image = None # type: ignore
|
||||
|
||||
|
||||
class PDFProcessor(FileProcessor):
    """Processor for PDF files with OCR.

    Renders each page to an image, then runs two OCR engines (EasyOCR and
    Tesseract, both Spanish) in parallel per batch; EasyOCR output is
    preferred, Tesseract is the fallback.
    """

    def __init__(self):
        super().__init__("PDFProcessor")
        self.logger = logging.getLogger(__name__)
        # EasyOCR reader is created lazily on first use (it costs VRAM).
        self._easyocr_reader = None

    def can_process(self, file_path: str) -> bool:
        """Return True only for .pdf files."""
        return self.get_file_extension(file_path) == ".pdf"

    def _load_easyocr(self):
        """Create the EasyOCR reader once, on GPU when one is available."""
        if self._easyocr_reader is None:
            use_gpu = gpu_detector.is_available()
            self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})")
            self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu)
            vram_manager.update_usage()

    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """Prepare a page image for OCR: grayscale plus 2x upscale."""
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')

        # 2x Lanczos upscale helps both engines with small glyphs.
        image = image.resize((image.width * 2, image.height * 2), Image.Resampling.LANCZOS)

        return image

    def _run_ocr_parallel(self, pil_images) -> Dict[str, list]:
        """Run both OCR engines over *pil_images* concurrently.

        Returns:
            Dict with keys 'easyocr' and 'tesseract', each a list aligned
            with *pil_images*; a failed engine degrades to empty strings.
        """
        results = {
            'easyocr': [''] * len(pil_images),
            'tesseract': [''] * len(pil_images)
        }

        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {}

            # EasyOCR — only when the reader was successfully loaded.
            if self._easyocr_reader:
                futures['easyocr'] = executor.submit(
                    self._easyocr_reader.readtext_batched,
                    pil_images,
                    detail=0
                )

            # Tesseract with the Spanish language pack.
            futures['tesseract'] = executor.submit(
                lambda imgs: [pytesseract.image_to_string(img, lang='spa') for img in imgs],
                pil_images
            )

            # Collect results; one engine failing must not sink the other.
            for name, future in futures.items():
                try:
                    results[name] = future.result()
                except Exception as e:
                    self.logger.error(f"OCR engine {name} failed: {e}")
                    results[name] = [''] * len(pil_images)

        return results

    def process(self, file_path: str) -> Dict[str, Any]:
        """OCR the PDF and write the combined text to LOCAL_DOWNLOADS_PATH.

        Returns:
            Dict with success flag, output text path, full text, and the
            number of pages rendered.

        Raises:
            FileProcessingError: when OCR dependencies are missing or any
                step of the pipeline fails.
        """
        self.validate_file(file_path)
        pdf_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt"

        if not PDF_OCR_AVAILABLE:
            raise FileProcessingError("PDF OCR dependencies not installed")

        self.logger.info(f"Processing PDF file: {pdf_path}")

        try:
            # Load EasyOCR if needed
            self._load_easyocr()
            vram_manager.update_usage()

            # Convert PDF pages to PNG images.
            self.logger.debug("Converting PDF to images")
            pil_images = convert_from_path(
                str(pdf_path),
                dpi=settings.PDF_DPI,
                fmt='png',
                thread_count=settings.PDF_RENDER_THREAD_COUNT
            )

            # Process in batches to bound memory usage.
            all_text = []
            batch_size = settings.PDF_BATCH_SIZE

            for i in range(0, len(pil_images), batch_size):
                batch = pil_images[i:i + batch_size]
                self.logger.debug(f"Processing batch {i//batch_size + 1}/{(len(pil_images) + batch_size - 1)//batch_size}")

                # Preprocess images
                preprocessed_batch = [self._preprocess_image(img) for img in batch]

                # Run OCR in parallel
                ocr_results = self._run_ocr_parallel(preprocessed_batch)

                # Combine results: prefer EasyOCR, fall back to Tesseract.
                for j, img in enumerate(batch):
                    text = ocr_results['easyocr'][j] if ocr_results['easyocr'][j] else ocr_results['tesseract'][j]
                    if text:
                        all_text.append(text)

            # Save combined text
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write("\n\n".join(all_text))

            self.logger.info(f"PDF processing completed: {output_path}")

            return {
                "success": True,
                "text_path": str(output_path),
                "text": "\n\n".join(all_text),
                "pages_processed": len(pil_images)
            }

        except Exception as e:
            self.logger.error(f"PDF processing failed: {e}")
            raise FileProcessingError(f"PDF processing failed: {e}")

    def cleanup(self) -> None:
        """Drop the EasyOCR reader and let the VRAM manager reclaim memory."""
        self._easyocr_reader = None
        vram_manager.cleanup()
|
||||
@@ -1,55 +0,0 @@
|
||||
"""
|
||||
Text file processor
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from core import FileProcessingError
|
||||
from config import settings
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
|
||||
class TextProcessor(FileProcessor):
    """Handles plain-text files by copying them into the downloads folder."""

    def __init__(self):
        super().__init__("TextProcessor")
        self.logger = logging.getLogger(__name__)

    def can_process(self, file_path: str) -> bool:
        """A file is accepted when its extension is a configured text type."""
        return self.get_file_extension(file_path) in settings.TXT_EXTENSIONS

    def process(self, file_path: str) -> Dict[str, Any]:
        """Copy the text file into LOCAL_DOWNLOADS_PATH and return its content.

        Raises:
            FileProcessingError: on validation or copy failure.
        """
        self.validate_file(file_path)
        source = Path(file_path)
        destination = settings.LOCAL_DOWNLOADS_PATH / source.name

        self.logger.info(f"Processing text file: {source}")

        try:
            destination.parent.mkdir(parents=True, exist_ok=True)
            destination.write_text(source.read_text(encoding='utf-8'), encoding='utf-8')

            self.logger.info(f"Text file processing completed: {destination}")

            return {
                "success": True,
                "text_path": str(destination),
                "text": self._read_file(destination),
            }

        except Exception as e:
            self.logger.error(f"Text processing failed: {e}")
            raise FileProcessingError(f"Text processing failed: {e}")

    def _read_file(self, file_path: Path) -> str:
        """Return the UTF-8 content of *file_path*."""
        return file_path.read_text(encoding='utf-8')
|
||||
Reference in New Issue
Block a user