""" PDF file processor with OCR """ import logging from pathlib import Path from typing import Dict, Any from concurrent.futures import ThreadPoolExecutor, as_completed from ..core import FileProcessingError from ..config import settings from ..services import vram_manager from ..services.gpu_detector import gpu_detector from .base_processor import FileProcessor try: import torch import pytesseract import easyocr import cv2 import numpy as np from pdf2image import convert_from_path from PIL import Image PDF_OCR_AVAILABLE = True except ImportError: PDF_OCR_AVAILABLE = False class PDFProcessor(FileProcessor): """Processor for PDF files with OCR""" def __init__(self): super().__init__("PDFProcessor") self.logger = logging.getLogger(__name__) self._easyocr_reader = None def can_process(self, file_path: str) -> bool: """Check if file is a PDF""" return self.get_file_extension(file_path) == ".pdf" def _load_easyocr(self): """Load EasyOCR reader""" if self._easyocr_reader is None: use_gpu = gpu_detector.is_available() self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})") self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu) vram_manager.update_usage() def _preprocess_image(self, image: Image.Image) -> Image.Image: """Preprocess image for better OCR""" # Convert to grayscale if image.mode != 'L': image = image.convert('L') # Simple preprocessing image = image.resize((image.width * 2, image.height * 2), Image.Resampling.LANCZOS) return image def _run_ocr_parallel(self, pil_images) -> Dict[str, list]: """Run all OCR engines in parallel""" results = { 'easyocr': [''] * len(pil_images), 'tesseract': [''] * len(pil_images) } with ThreadPoolExecutor(max_workers=2) as executor: futures = {} # EasyOCR if self._easyocr_reader: futures['easyocr'] = executor.submit( self._easyocr_reader.readtext_batched, pil_images, detail=0 ) # Tesseract futures['tesseract'] = executor.submit( lambda imgs: [pytesseract.image_to_string(img, lang='spa') for img in imgs], pil_images ) # Collect results for name, future in futures.items(): try: results[name] = future.result() except Exception as e: self.logger.error(f"OCR engine {name} failed: {e}") results[name] = [''] * len(pil_images) return results def process(self, file_path: str) -> Dict[str, Any]: """Process PDF with OCR""" self.validate_file(file_path) pdf_path = Path(file_path) output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt" if not PDF_OCR_AVAILABLE: raise FileProcessingError("PDF OCR dependencies not installed") self.logger.info(f"Processing PDF file: {pdf_path}") try: # Load EasyOCR if needed self._load_easyocr() vram_manager.update_usage() # Convert PDF to images self.logger.debug("Converting PDF to images") pil_images = convert_from_path( str(pdf_path), dpi=settings.PDF_DPI, fmt='png', thread_count=settings.PDF_RENDER_THREAD_COUNT ) # Process in batches all_text = [] batch_size = settings.PDF_BATCH_SIZE for i in range(0, len(pil_images), batch_size): batch = pil_images[i:i + batch_size] self.logger.debug(f"Processing batch {i//batch_size + 1}/{(len(pil_images) + batch_size - 1)//batch_size}") # Preprocess images preprocessed_batch = [self._preprocess_image(img) for img in batch] # Run OCR in parallel ocr_results = self._run_ocr_parallel(preprocessed_batch) # Combine results for j, img in enumerate(batch): # Take best result (simple approach: try EasyOCR first, then Tesseract) text = ocr_results['easyocr'][j] if ocr_results['easyocr'][j] else ocr_results['tesseract'][j] if text: all_text.append(text) # Save combined text output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, 'w', encoding='utf-8') as f: f.write("\n\n".join(all_text)) self.logger.info(f"PDF processing completed: {output_path}") return { "success": True, "text_path": str(output_path), "text": "\n\n".join(all_text), "pages_processed": len(pil_images) } except Exception as e: self.logger.error(f"PDF processing failed: {e}") raise FileProcessingError(f"PDF processing failed: {e}") def cleanup(self) -> None: """Cleanup OCR models""" self._easyocr_reader = None vram_manager.cleanup()