feat: Sistema CBCFacil completo con cola secuencial

- Implementa ProcessingMonitor singleton para procesamiento secuencial de archivos
- Agrega AI summary service con soporte para MiniMax API
- Agrega PDF generator para resúmenes
- Agrega watchers para monitoreo de carpeta remota
- Mejora sistema de notificaciones Telegram
- Implementa gestión de VRAM para GPU
- Configuración mediante variables de entorno (sin hardcoded secrets)
- .env y transcriptions/ agregados a .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
renato97
2026-02-25 15:35:39 +00:00
parent dcf887c510
commit ee8fc183be
77 changed files with 3734 additions and 20263 deletions

View File

@@ -1,15 +1,5 @@
"""
Processors package for CBCFacil
"""
"""Procesadores de documentos y medios."""
from .base_processor import FileProcessor
from .audio_processor import AudioProcessor
from .pdf_processor import PDFProcessor
from .text_processor import TextProcessor
from processors.audio_processor import AudioProcessor, AudioProcessingError
__all__ = [
'FileProcessor',
'AudioProcessor',
'PDFProcessor',
'TextProcessor'
]
__all__ = ["AudioProcessor", "AudioProcessingError"]

View File

@@ -1,93 +1,467 @@
"""
Audio file processor using Whisper
Procesador de audio para transcripción con Whisper.
OPTIMIZACIONES DE MEMORIA PARA GPUs DE 8GB:
- Cache global singleton para evitar carga múltiple del modelo
- Configuración PYTORCH_ALLOC_CONF para reducir fragmentación
- Verificación de VRAM antes de cargar
- Fallback automático a CPU si GPU OOM
- Limpieza agresiva de cache CUDA
"""
import gc
import logging
import os
import subprocess
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
from pathlib import Path
from typing import Dict, Any
from core import FileProcessingError
from typing import Dict, Literal, Optional, Tuple
import whisper
from config import settings
from services import vram_manager
from services.gpu_detector import gpu_detector
from .base_processor import FileProcessor
try:
import whisper
import torch
WHISPER_AVAILABLE = True
except ImportError:
WHISPER_AVAILABLE = False
from services.vram_manager import vram_manager
class AudioProcessor(FileProcessor):
"""Processor for audio files using Whisper"""
logger = logging.getLogger(__name__)
def __init__(self):
super().__init__("AudioProcessor")
self.logger = logging.getLogger(__name__)
self._model = None
self._model_name = "medium" # Optimized for Spanish
def can_process(self, file_path: str) -> bool:
"""Check if file is an audio file"""
ext = self.get_file_extension(file_path)
return ext in settings.AUDIO_EXTENSIONS
# ============ CONFIGURACIÓN DE OPTIMIZACIONES ============
def _load_model(self):
"""Load Whisper model lazily"""
if not WHISPER_AVAILABLE:
raise FileProcessingError("Whisper not installed")
# CRÍTICO: Permite segmentos expandibles para reducir fragmentación
os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
if self._model is None:
device = gpu_detector.get_device()
self.logger.info(f"Loading Whisper model: {self._model_name} on {device}")
self._model = whisper.load_model(self._model_name, device=device)
vram_manager.update_usage()
# Tamaños de modelos en GB (incluyendo overhead)
MODEL_MEMORY_REQUIREMENTS = {
"tiny": 0.5, "base": 0.8, "small": 1.5,
"medium": 2.5, "large": 4.5,
}
def process(self, file_path: str) -> Dict[str, Any]:
"""Transcribe audio file"""
self.validate_file(file_path)
audio_path = Path(file_path)
output_path = settings.LOCAL_DOWNLOADS_PATH / f"{audio_path.stem}.txt"
# Cache global singleton - CLAVE para evitar OOM
_model_cache: Dict[str, Tuple[whisper.Whisper, str, float]] = {}
self.logger.info(f"Processing audio file: {audio_path}")
TRANSCRIPTION_TIMEOUT_SECONDS = 600
MAX_RETRY_ATTEMPTS = 2
RETRY_DELAY_SECONDS = 5
# ============ FUNCIONES DE GESTIÓN DE MEMORIA ============
def get_gpu_memory_info() -> Dict[str, float]:
    """Return GPU memory statistics in GB: total, free, used, reserved.

    "free" is computed against the allocator's reserved pool, not raw
    driver-free memory. When torch is missing or no CUDA device exists,
    an all-zero dict is returned so callers can treat "no GPU" uniformly.
    """
    try:
        import torch
        if torch.cuda.is_available():
            device_props = torch.cuda.get_device_properties(0)
            gb = 1024 ** 3
            total_gb = device_props.total_memory / gb
            reserved_gb = torch.cuda.memory_reserved(0) / gb
            allocated_gb = torch.cuda.memory_allocated(0) / gb
            return {
                "total": total_gb,
                "free": total_gb - reserved_gb,
                "used": allocated_gb,
                "reserved": reserved_gb,
            }
    except Exception:
        # Best-effort probe: any torch/driver failure counts as "no GPU".
        pass
    return {"total": 0, "free": 0, "used": 0, "reserved": 0}
def clear_cuda_cache(aggressive: bool = False) -> None:
    """Release cached CUDA allocator blocks back to the driver.

    With ``aggressive=True`` the Python GC runs several times between
    cache flushes, helping reclaim blocks still pinned by collectable
    objects. Silently a no-op when torch/CUDA is unavailable.
    """
    try:
        import torch
    except Exception:
        return
    try:
        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()
        if aggressive:
            for _ in range(3):
                gc.collect()
                torch.cuda.empty_cache()
    except Exception:
        # Best-effort cleanup: never let cache maintenance crash callers.
        pass
def check_memory_for_model(model_name: str) -> Tuple[bool, str]:
    """Check whether free VRAM can hold the given Whisper model.

    Args:
        model_name: Whisper model size key (falls back to 2.0 GB if unknown).

    Returns:
        (fits, device): ``(True, "cuda")`` when the model fits in free
        VRAM, ``(False, "cpu")`` when no GPU exists or memory is short.
    """
    required = MODEL_MEMORY_REQUIREMENTS.get(model_name, 2.0)
    gpu_info = get_gpu_memory_info()
    if gpu_info["total"] == 0:
        # No CUDA device detected at all.
        return False, "cpu"
    # FIX: the original had two branches (free >= required*1.5 and
    # free >= required) that returned the identical result; they are
    # merged — the decision depends only on the bare requirement.
    if gpu_info["free"] >= required:
        return True, "cuda"
    logger.warning(f"Memoria GPU insuficiente para '{model_name}': {gpu_info['free']:.2f}GB libre, {required:.2f}GB necesario")
    return False, "cpu"
def get_cached_model(model_name: str, device: str) -> Optional[whisper.Whisper]:
    """Fetch a Whisper model from the global cache, refreshing its timestamp.

    Returns None on a cache miss.
    """
    cache_key = f"{model_name}_{device}"
    entry = _model_cache.get(cache_key)
    if entry is None:
        return None
    model, cached_device, _ = entry
    # FIX: ``cached_device`` is embedded in the cache key, so it always
    # equals ``device``; the original stale-entry eviction branch was
    # unreachable and has been removed.
    logger.info(f"Modelo '{model_name}' desde cache global")
    _model_cache[cache_key] = (model, cached_device, time.time())
    return model
def cache_model(model_name: str, model: whisper.Whisper, device: str) -> None:
    """Insert a loaded Whisper model into the global cache, timestamped."""
    key = f"{model_name}_{device}"
    _model_cache[key] = (model, device, time.time())
    logger.info(f"Modelo '{model_name}' cacheado en {device}")
def clear_model_cache() -> None:
    """Drop every cached model and aggressively flush the CUDA cache.

    FIX: the original looped over entries doing ``del model`` on the loop
    variable, which only unbinds a local name and frees nothing; the
    ``global`` declaration was also unnecessary (the dict is mutated, not
    rebound). Clearing the dict is what actually drops the references.
    """
    _model_cache.clear()
    clear_cuda_cache(aggressive=True)
# ============ EXCEPCIONES ============
class AudioProcessingError(Exception):
    """Base error for audio-processing failures."""


class TranscriptionTimeoutError(AudioProcessingError):
    """Raised when a transcription exceeds the allowed wall-clock time."""


class GPUOutOfMemoryError(AudioProcessingError):
    """Raised on CUDA out-of-memory conditions."""


class AudioValidationError(AudioProcessingError):
    """Raised when an audio file fails pre-transcription validation."""
# ============ PROCESADOR DE AUDIO ============
class AudioProcessor:
"""Procesador de audio con cache global y fallback automático."""
SUPPORTED_MODELS = ("tiny", "base", "small", "medium", "large")
DEFAULT_MODEL = settings.WHISPER_MODEL
DEFAULT_LANGUAGE = "es"
def __init__(
self,
model_name: Optional[str] = None,
language: Optional[str] = None,
device: Optional[Literal["cuda", "rocm", "cpu", "auto"]] = None,
) -> None:
self._model_name = model_name or settings.WHISPER_MODEL
self._language = language or self.DEFAULT_LANGUAGE
self._device = device or "auto"
self._model: Optional[whisper.Whisper] = None
self._using_cpu_fallback = False
self._model_id = f"whisper_{self._model_name}"
if self._model_name not in self.SUPPORTED_MODELS:
raise ValueError(
f"Modelo '{self._model_name}' no soportado. "
f"Disponibles: {', '.join(self.SUPPORTED_MODELS)}"
)
logger.info(
"AudioProcessor inicializado",
extra={"model": self._model_name, "device": self._device},
)
    @property
    def model_name(self) -> str:
        """Configured Whisper model size (e.g. "medium")."""
        return self._model_name
    @property
    def language(self) -> str:
        """Target transcription language code (defaults to Spanish)."""
        return self._language
    @property
    def device(self) -> str:
        """Resolved device once the model loaded, else the requested one."""
        return getattr(self, "_resolved_device", self._device)
    @property
    def is_loaded(self) -> bool:
        """Whether a Whisper model is currently held in memory."""
        return self._model is not None
def _validate_audio_file(self, audio_path: Path) -> dict:
"""Valida el archivo de audio."""
logger.info(f"Validando: {audio_path.name}")
file_size = audio_path.stat().st_size
if file_size < 1024:
raise AudioValidationError("Archivo demasiado pequeño")
if file_size > 500 * 1024 * 1024:
logger.warning(f"Archivo grande: {file_size / 1024 / 1024:.1f}MB")
try:
# Load model if needed
self._load_model()
cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
"-show_entries", "stream=channels,sample_rate,codec_name",
"-of", "json", str(audio_path)]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
if result.returncode == 0:
import json
info = json.loads(result.stdout)
duration = float(info.get("format", {}).get("duration", 0))
for stream in info.get("streams", []):
if stream.get("codec_type") == "audio":
return {
"duration": duration,
"sample_rate": int(stream.get("sample_rate", 16000)),
"channels": int(stream.get("channels", 1)),
"codec": stream.get("codec_name", "unknown"),
"size_bytes": file_size,
}
except Exception:
pass
# Update VRAM usage
vram_manager.update_usage()
return {"duration": 0, "sample_rate": 16000, "channels": 1,
"codec": "unknown", "size_bytes": file_size}
# Transcribe with torch.no_grad() for memory efficiency
with torch.inference_mode():
result = self._model.transcribe(
str(audio_path),
language="es",
fp16=True,
verbose=False
def _convert_audio_with_ffmpeg(self, input_path: Path, output_format: str = "wav") -> Path:
"""Convierte audio usando ffmpeg."""
suffix = f".{output_format}"
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
output_path = Path(tmp.name)
cmd = ["ffmpeg", "-i", str(input_path),
"-acodec", "pcm_s16le" if output_format == "wav" else "libmp3lame",
"-ar", "16000", "-ac", "1", "-y", str(output_path)]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
if result.returncode != 0 or not output_path.exists():
raise AudioProcessingError(f"ffmpeg falló: {result.stderr[-500:] if result.stderr else 'Unknown'}")
return output_path
def _get_device_with_memory_check(self) -> str:
"""Detecta dispositivo verificando memoria disponible."""
if self._device == "cpu":
return "cpu"
if self._device == "auto":
has_memory, recommended = check_memory_for_model(self._model_name)
if has_memory and recommended == "cuda":
try:
import torch
if torch.cuda.is_available():
logger.info(f"GPU detectada: {torch.cuda.get_device_name(0)}")
return "cuda"
except ImportError:
pass
if not has_memory:
logger.warning("Usando CPU por falta de memoria GPU")
self._using_cpu_fallback = True
return "cpu"
return self._device
def _load_model(self, force_reload: bool = False) -> None:
"""Carga modelo usando cache global con optimizaciones de memoria."""
if self._model is not None and not force_reload:
return
# Configurar PyTorch para mejor gestión de memoria
import os
os.environ['PYTORCH_ALLOC_CONF'] = 'expandable_segments:True'
clear_cuda_cache(aggressive=True)
self._resolved_device = self._get_device_with_memory_check()
# Verificar cache global
if not force_reload:
cached = get_cached_model(self._model_name, self._resolved_device)
if cached is not None:
self._model = cached
return
try:
# Cargar modelo con menos memoria inicial
# Primero cargar en RAM, luego mover a GPU
import torch
with torch.cuda.device(self._resolved_device):
self._model = whisper.load_model(
self._model_name,
device=self._resolved_device,
download_root=None,
in_memory=True # Reducir uso de disco
)
# Save transcription
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(result["text"])
# Limpiar cache después de cargar
torch.cuda.empty_cache()
self.logger.info(f"Transcription completed: {output_path}")
cache_model(self._model_name, self._model, self._resolved_device)
return {
"success": True,
"transcription_path": str(output_path),
"text": result["text"],
"model_used": self._model_name
}
gpu_info = get_gpu_memory_info()
logger.info(
f"Modelo cargado en {self._resolved_device}",
extra={"gpu_used_gb": round(gpu_info.get("used", 0), 2),
"gpu_free_gb": round(gpu_info.get("free", 0), 2)},
)
vram_manager.update_usage(self._model_id)
except RuntimeError as e:
error_str = str(e)
if "out of memory" in error_str.lower():
# NUNCA usar CPU - limpiar GPU y reintentar
logger.error(f"OOM en GPU - limpiando memoria para reintentar...")
clear_cuda_cache(aggressive=True)
raise AudioProcessingError(f"CUDA OOM - limpie la GPU y reintente. {error_str}") from e
else:
raise AudioProcessingError(f"Error cargando modelo: {e}") from e
except Exception as e:
self.logger.error(f"Audio processing failed: {e}")
raise FileProcessingError(f"Audio processing failed: {e}")
raise AudioProcessingError(f"Error cargando modelo: {e}") from e
def cleanup(self) -> None:
"""Cleanup model"""
def _transcribe_internal(self, audio_path: Path, audio_properties: dict) -> str:
"""Ejecuta la transcripción real."""
result = self._model.transcribe(
str(audio_path),
language=self._language,
fp16=self._resolved_device in ("cuda", "rocm"),
verbose=False,
)
return result.get("text", "").strip()
    def transcribe(self, audio_path: str) -> str:
        """Transcribe an audio file and return the recognized text.

        Validates the file, converts it with ffmpeg when needed, runs
        Whisper in a worker thread under a hard timeout, and retries on
        transient failures (including a one-time GPU->CPU fallback on
        CUDA OOM).

        Raises:
            FileNotFoundError: the input path does not exist.
            AudioValidationError: the file fails basic validation.
            TranscriptionTimeoutError: transcription exceeded the timeout.
            GPUOutOfMemoryError: GPU memory exhausted on the final attempt.
            AudioProcessingError: any other processing failure.
        """
        audio_file = Path(audio_path)
        if not audio_file.exists():
            raise FileNotFoundError(f"Archivo no encontrado: {audio_path}")
        # Refresh this model's "last used" mark with the VRAM manager.
        vram_manager.update_usage(self._model_id)
        try:
            audio_properties = self._validate_audio_file(audio_file)
        except AudioValidationError as e:
            logger.error(f"Validación falló: {e}")
            raise
        converted_file: Optional[Path] = None
        last_error: Optional[Exception] = None
        for attempt in range(MAX_RETRY_ATTEMPTS):
            try:
                # On retries, force a clean reload of the model.
                force_reload = attempt > 0
                if self._model is None or force_reload:
                    self._load_model(force_reload=force_reload)
                audio_to_transcribe = audio_file
                cleanup_converted = False
                # Convert anything that is not plain wav/mp3, uses a codec
                # Whisper handles poorly, or is multi-channel.
                needs_conversion = (
                    audio_file.suffix.lower() not in {".wav", ".mp3"} or
                    audio_properties.get("codec") in ("aac", "opus", "vorbis") or
                    audio_properties.get("channels", 1) > 1
                )
                if needs_conversion:
                    try:
                        converted_file = self._convert_audio_with_ffmpeg(audio_file, "wav")
                        audio_to_transcribe = converted_file
                        cleanup_converted = True
                    except AudioProcessingError as e:
                        # Best effort: fall back to the original file.
                        logger.warning(f"Conversión falló: {e}")
                logger.info(
                    f"Transcribiendo: {audio_file.name}",
                    extra={"device": self._resolved_device, "cpu_fallback": self._using_cpu_fallback},
                )
                # Run in a single worker thread so a wall-clock timeout can
                # be enforced via Future.result(timeout=...).
                with ThreadPoolExecutor(max_workers=1) as executor:
                    future = executor.submit(self._transcribe_internal, audio_to_transcribe, audio_properties)
                    try:
                        text = future.result(timeout=TRANSCRIPTION_TIMEOUT_SECONDS)
                    except FutureTimeoutError:
                        self.unload()
                        raise TranscriptionTimeoutError(f"Timeout después de {TRANSCRIPTION_TIMEOUT_SECONDS}s")
                logger.info(f"Transcripción completada: {len(text)} caracteres")
                return text
            except RuntimeError as e:
                error_str = str(e)
                last_error = e
                if "out of memory" in error_str.lower():
                    logger.warning("OOM durante transcripción...")
                    clear_cuda_cache(aggressive=True)
                    if not self._using_cpu_fallback and self._resolved_device in ("cuda", "rocm"):
                        # First OOM on GPU: move the whole pipeline to CPU.
                        self.unload()
                        self._resolved_device = "cpu"
                        self._using_cpu_fallback = True
                        self._load_model()
                        continue
                    if attempt >= MAX_RETRY_ATTEMPTS - 1:
                        raise GPUOutOfMemoryError("Memoria GPU insuficiente") from e
                    time.sleep(RETRY_DELAY_SECONDS)
                    continue
                if "Key and Value must have the same sequence length" in error_str:
                    # Known Whisper failure mode for some containers: retry
                    # once on a wav-converted copy of the input.
                    if not converted_file:
                        converted_file = self._convert_audio_with_ffmpeg(audio_file, "wav")
                    text = self._model.transcribe(
                        str(converted_file), language=self._language,
                        fp16=self._resolved_device in ("cuda", "rocm"), verbose=False
                    ).get("text", "").strip()
                    converted_file.unlink()
                    return text
                raise AudioProcessingError(f"Error de transcripción: {e}") from e
            except (TranscriptionTimeoutError, GPUOutOfMemoryError):
                # Already final: propagate unchanged.
                raise
            except Exception as e:
                last_error = e
                self.unload()
                if attempt >= MAX_RETRY_ATTEMPTS - 1:
                    raise AudioProcessingError(f"Error después de {MAX_RETRY_ATTEMPTS} intentos: {e}") from e
                time.sleep(RETRY_DELAY_SECONDS)
            finally:
                # Remove any temporary converted file from this attempt.
                # NOTE(review): ``cleanup_converted`` is computed but never
                # consulted here — presumably the unlink was meant to be
                # conditional on it; confirm intent.
                if converted_file and converted_file.exists():
                    try:
                        converted_file.unlink()
                    except Exception:
                        pass
        raise AudioProcessingError(f"Error al transcribir: {last_error}") from last_error
    def unload(self) -> None:
        """Release the local model reference and associated GPU bookkeeping."""
        if self._model is not None:
            del self._model
            self._model = None
            # NOTE(review): vram_manager.cleanup() here looks like a leftover
            # from the previous implementation — together with
            # unregister_model() below it may double-release; confirm against
            # the vram_manager API.
            vram_manager.cleanup()
            clear_cuda_cache(aggressive=False)
            vram_manager.unregister_model(self._model_id)
def __repr__(self) -> str:
return f"AudioProcessor(model='{self._model_name}', device='{self.device}', loaded={self.is_loaded})"
def __del__(self) -> None:
try:
self.unload()
except Exception:
pass

View File

@@ -1,40 +0,0 @@
"""
Base File Processor (Strategy Pattern)
"""
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Any, Optional
from core import FileProcessingError
class FileProcessor(ABC):
    """Strategy-pattern base class for all file processors."""

    def __init__(self, name: str):
        # Human-readable processor name, used for identification/logging.
        self.name = name

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Return True when this processor handles the given file type."""

    @abstractmethod
    def process(self, file_path: str) -> Dict[str, Any]:
        """Process the file and return a result dictionary."""

    def get_file_extension(self, file_path: str) -> str:
        """Lower-cased extension (including the dot) of *file_path*."""
        return Path(file_path).suffix.lower()

    def get_base_name(self, file_path: str) -> str:
        """File name of *file_path* without its final extension."""
        return Path(file_path).stem

    def validate_file(self, file_path: str) -> None:
        """Raise FileProcessingError unless *file_path* is an existing file."""
        candidate = Path(file_path)
        if not candidate.exists():
            raise FileProcessingError(f"File not found: {file_path}")
        if not candidate.is_file():
            raise FileProcessingError(f"Path is not a file: {file_path}")

View File

@@ -1,164 +0,0 @@
"""
PDF file processor with OCR
"""
import logging
from pathlib import Path
from typing import Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from core import FileProcessingError
from config import settings
from services import vram_manager
from services.gpu_detector import gpu_detector
from .base_processor import FileProcessor
try:
import torch
import pytesseract
import easyocr
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image
PDF_OCR_AVAILABLE = True
except ImportError:
PDF_OCR_AVAILABLE = False
# Provide stub for type hints
try:
from PIL import Image
except ImportError:
Image = None # type: ignore
class PDFProcessor(FileProcessor):
    """Processor for PDF files with OCR.

    Renders each PDF page to an image, runs EasyOCR and Tesseract in
    parallel on batches of pages, and writes the combined text to the
    downloads directory.
    """
    def __init__(self):
        super().__init__("PDFProcessor")
        self.logger = logging.getLogger(__name__)
        # EasyOCR reader is loaded lazily on first process() call.
        self._easyocr_reader = None
    def can_process(self, file_path: str) -> bool:
        """Check if file is a PDF."""
        return self.get_file_extension(file_path) == ".pdf"
    def _load_easyocr(self):
        """Load the EasyOCR reader (Spanish) once, on GPU when available."""
        if self._easyocr_reader is None:
            use_gpu = gpu_detector.is_available()
            self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})")
            self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu)
            vram_manager.update_usage()
    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """Preprocess a page image for better OCR accuracy."""
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')
        # Upscale 2x with Lanczos — small text recognizes better enlarged.
        image = image.resize((image.width * 2, image.height * 2), Image.Resampling.LANCZOS)
        return image
    def _run_ocr_parallel(self, pil_images) -> Dict[str, list]:
        """Run both OCR engines concurrently over a batch of page images.

        Returns a dict with one text entry per image per engine; a failed
        engine contributes empty strings instead of raising.
        """
        results = {
            'easyocr': [''] * len(pil_images),
            'tesseract': [''] * len(pil_images)
        }
        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {}
            # EasyOCR — skipped entirely if the reader failed to load.
            # NOTE(review): assumes readtext_batched(..., detail=0) yields
            # one text value per image — confirm against the easyocr version.
            if self._easyocr_reader:
                futures['easyocr'] = executor.submit(
                    self._easyocr_reader.readtext_batched,
                    pil_images,
                    detail=0
                )
            # Tesseract (Spanish language pack)
            futures['tesseract'] = executor.submit(
                lambda imgs: [pytesseract.image_to_string(img, lang='spa') for img in imgs],
                pil_images
            )
            # Collect results; a failed engine degrades to empty strings.
            for name, future in futures.items():
                try:
                    results[name] = future.result()
                except Exception as e:
                    self.logger.error(f"OCR engine {name} failed: {e}")
                    results[name] = [''] * len(pil_images)
        return results
    def process(self, file_path: str) -> Dict[str, Any]:
        """OCR the PDF and write the extracted text next to the downloads.

        Returns a dict with success flag, output text path, full text and
        page count. Raises FileProcessingError on any failure or when the
        OCR dependencies are not installed.
        """
        self.validate_file(file_path)
        pdf_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt"
        if not PDF_OCR_AVAILABLE:
            raise FileProcessingError("PDF OCR dependencies not installed")
        self.logger.info(f"Processing PDF file: {pdf_path}")
        try:
            # Load EasyOCR if needed
            self._load_easyocr()
            vram_manager.update_usage()
            # Convert PDF to images
            self.logger.debug("Converting PDF to images")
            pil_images = convert_from_path(
                str(pdf_path),
                dpi=settings.PDF_DPI,
                fmt='png',
                thread_count=settings.PDF_RENDER_THREAD_COUNT
            )
            # Process in batches to bound memory usage.
            all_text = []
            batch_size = settings.PDF_BATCH_SIZE
            for i in range(0, len(pil_images), batch_size):
                batch = pil_images[i:i + batch_size]
                self.logger.debug(f"Processing batch {i//batch_size + 1}/{(len(pil_images) + batch_size - 1)//batch_size}")
                # Preprocess images
                preprocessed_batch = [self._preprocess_image(img) for img in batch]
                # Run OCR in parallel
                ocr_results = self._run_ocr_parallel(preprocessed_batch)
                # Combine results
                for j, img in enumerate(batch):
                    # Take best result (simple approach: try EasyOCR first, then Tesseract)
                    text = ocr_results['easyocr'][j] if ocr_results['easyocr'][j] else ocr_results['tesseract'][j]
                    if text:
                        all_text.append(text)
            # Save combined text
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write("\n\n".join(all_text))
            self.logger.info(f"PDF processing completed: {output_path}")
            return {
                "success": True,
                "text_path": str(output_path),
                "text": "\n\n".join(all_text),
                "pages_processed": len(pil_images)
            }
        except Exception as e:
            self.logger.error(f"PDF processing failed: {e}")
            raise FileProcessingError(f"PDF processing failed: {e}")
    def cleanup(self) -> None:
        """Drop the OCR reader reference and ask the VRAM manager to clean up."""
        self._easyocr_reader = None
        vram_manager.cleanup()

View File

@@ -1,55 +0,0 @@
"""
Text file processor
"""
import logging
from pathlib import Path
from typing import Dict, Any
from core import FileProcessingError
from config import settings
from .base_processor import FileProcessor
class TextProcessor(FileProcessor):
    """Processor for plain-text files: copies them into the downloads dir."""

    def __init__(self):
        super().__init__("TextProcessor")
        self.logger = logging.getLogger(__name__)

    def can_process(self, file_path: str) -> bool:
        """True for extensions listed in settings.TXT_EXTENSIONS."""
        ext = self.get_file_extension(file_path)
        return ext in settings.TXT_EXTENSIONS

    def process(self, file_path: str) -> Dict[str, Any]:
        """Copy the text file to the downloads directory.

        Returns:
            Dict with ``success``, ``text_path`` and ``text`` keys.

        Raises:
            FileProcessingError: on any I/O failure.
        """
        self.validate_file(file_path)
        text_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / text_path.name
        self.logger.info(f"Processing text file: {text_path}")
        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            # FIX: read the source once and reuse the content for both the
            # copy and the returned payload. The original wrote the copy and
            # then re-read it from disk via _read_file, doing the I/O twice.
            content = text_path.read_text(encoding='utf-8')
            output_path.write_text(content, encoding='utf-8')
            self.logger.info(f"Text file processing completed: {output_path}")
            return {
                "success": True,
                "text_path": str(output_path),
                "text": content
            }
        except Exception as e:
            self.logger.error(f"Text processing failed: {e}")
            # Chain the cause so the original traceback is preserved.
            raise FileProcessingError(f"Text processing failed: {e}") from e

    def _read_file(self, file_path: Path) -> str:
        """Read and return a file's full content (kept for compatibility)."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()