CBCFacil v8.0 - Refactored with AMD GPU support

2026-01-09 13:05:46 -03:00
parent cb17136f21
commit b017504c52
54 changed files with 7251 additions and 3670 deletions
--- a/processors/pdf_processor.py
+++ b/processors/pdf_processor.py
@@ -0,0 +1,159 @@
+"""
+PDF file processor with OCR
+"""
+import logging
+from pathlib import Path
+from typing import Dict, Any
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from ..core import FileProcessingError
+from ..config import settings
+from ..services import vram_manager
+from ..services.gpu_detector import gpu_detector
+from .base_processor import FileProcessor
+
+try:
+    import torch
+    import pytesseract
+    import easyocr
+    import cv2
+    import numpy as np
+    from pdf2image import convert_from_path
+    from PIL import Image
+    PDF_OCR_AVAILABLE = True
+except ImportError:
+    PDF_OCR_AVAILABLE = False
+
+
+class PDFProcessor(FileProcessor):
+    """Processor for PDF files with OCR"""
+
+    def __init__(self):
+        super().__init__("PDFProcessor")
+        self.logger = logging.getLogger(__name__)
+        self._easyocr_reader = None
+
+    def can_process(self, file_path: str) -> bool:
+        """Check if file is a PDF"""
+        return self.get_file_extension(file_path) == ".pdf"
+
+    def _load_easyocr(self):
+        """Load EasyOCR reader"""
+        if self._easyocr_reader is None:
+            use_gpu = gpu_detector.is_available()
+            self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})")
+            self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu)
+            vram_manager.update_usage()
+
+    def _preprocess_image(self, image: Image.Image) -> Image.Image:
+        """Preprocess image for better OCR"""
+        # Convert to grayscale
+        if image.mode != 'L':
+            image = image.convert('L')
+
+        # Simple preprocessing
+        image = image.resize((image.width * 2, image.height * 2), Image.Resampling.LANCZOS)
+
+        return image
+
+    def _run_ocr_parallel(self, pil_images) -> Dict[str, list]:
+        """Run all OCR engines in parallel"""
+        results = {
+            'easyocr': [''] * len(pil_images),
+            'tesseract': [''] * len(pil_images)
+        }
+
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            futures = {}
+
+            # EasyOCR
+            if self._easyocr_reader:
+                futures['easyocr'] = executor.submit(
+                    self._easyocr_reader.readtext_batched,
+                    pil_images,
+                    detail=0
+                )
+
+            # Tesseract
+            futures['tesseract'] = executor.submit(
+                lambda imgs: [pytesseract.image_to_string(img, lang='spa') for img in imgs],
+                pil_images
+            )
+
+            # Collect results
+            for name, future in futures.items():
+                try:
+                    results[name] = future.result()
+                except Exception as e:
+                    self.logger.error(f"OCR engine {name} failed: {e}")
+                    results[name] = [''] * len(pil_images)
+
+        return results
+
+    def process(self, file_path: str) -> Dict[str, Any]:
+        """Process PDF with OCR"""
+        self.validate_file(file_path)
+        pdf_path = Path(file_path)
+        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt"
+
+        if not PDF_OCR_AVAILABLE:
+            raise FileProcessingError("PDF OCR dependencies not installed")
+
+        self.logger.info(f"Processing PDF file: {pdf_path}")
+
+        try:
+            # Load EasyOCR if needed
+            self._load_easyocr()
+            vram_manager.update_usage()
+
+            # Convert PDF to images
+            self.logger.debug("Converting PDF to images")
+            pil_images = convert_from_path(
+                str(pdf_path),
+                dpi=settings.PDF_DPI,
+                fmt='png',
+                thread_count=settings.PDF_RENDER_THREAD_COUNT
+            )
+
+            # Process in batches
+            all_text = []
+            batch_size = settings.PDF_BATCH_SIZE
+
+            for i in range(0, len(pil_images), batch_size):
+                batch = pil_images[i:i + batch_size]
+                self.logger.debug(f"Processing batch {i//batch_size + 1}/{(len(pil_images) + batch_size - 1)//batch_size}")
+
+                # Preprocess images
+                preprocessed_batch = [self._preprocess_image(img) for img in batch]
+
+                # Run OCR in parallel
+                ocr_results = self._run_ocr_parallel(preprocessed_batch)
+
+                # Combine results
+                for j, img in enumerate(batch):
+                    # Take best result (simple approach: try EasyOCR first, then Tesseract)
+                    text = ocr_results['easyocr'][j] if ocr_results['easyocr'][j] else ocr_results['tesseract'][j]
+                    if text:
+                        all_text.append(text)
+
+            # Save combined text
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write("\n\n".join(all_text))
+
+            self.logger.info(f"PDF processing completed: {output_path}")
+
+            return {
+                "success": True,
+                "text_path": str(output_path),
+                "text": "\n\n".join(all_text),
+                "pages_processed": len(pil_images)
+            }
+
+        except Exception as e:
+            self.logger.error(f"PDF processing failed: {e}")
+            raise FileProcessingError(f"PDF processing failed: {e}")
+
+    def cleanup(self) -> None:
+        """Cleanup OCR models"""
+        self._easyocr_reader = None
+        vram_manager.cleanup()