CBCFacil v8.0 - Refactored with AMD GPU support
This commit is contained in:
159
processors/pdf_processor.py
Normal file
159
processors/pdf_processor.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
PDF file processor with OCR
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from ..services import vram_manager
|
||||
from ..services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
# Optional OCR stack. All of these third-party packages are imported as a
# group: if any single import fails, PDF_OCR_AVAILABLE is set to False and
# PDFProcessor.process() raises FileProcessingError instead of attempting OCR.
try:
    import torch
    import pytesseract
    import easyocr
    import cv2
    import numpy as np
    from pdf2image import convert_from_path
    from PIL import Image
    PDF_OCR_AVAILABLE = True
except ImportError:
    # At least one OCR dependency is missing; the names above may be undefined.
    PDF_OCR_AVAILABLE = False
|
||||
|
||||
|
||||
class PDFProcessor(FileProcessor):
    """Extract text from PDF files by rendering pages and running OCR.

    Every page is rendered to an image, converted to grayscale and upscaled,
    then handed to EasyOCR and Tesseract. For each page the EasyOCR text is
    preferred; Tesseract is the fallback when EasyOCR produced nothing.
    """

    def __init__(self):
        super().__init__("PDFProcessor")
        self.logger = logging.getLogger(__name__)
        # Created lazily by _load_easyocr() and dropped again by cleanup().
        self._easyocr_reader = None

    def can_process(self, file_path: str) -> bool:
        """Return True when ``get_file_extension(file_path)`` is ``".pdf"``."""
        return self.get_file_extension(file_path) == ".pdf"

    def _load_easyocr(self):
        """Create the Spanish EasyOCR reader on first use.

        The reader is cached on the instance, so repeated calls are no-ops
        until cleanup() resets it.
        """
        if self._easyocr_reader is None:
            use_gpu = gpu_detector.is_available()
            self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})")
            self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu)
            vram_manager.update_usage()

    # The annotations are strings on purpose: PIL is imported inside the
    # optional-dependency guard, so evaluating ``Image.Image`` when the class
    # body runs would raise NameError if the OCR stack is not installed.
    def _preprocess_image(self, image: "Image.Image") -> "Image.Image":
        """Return a grayscale copy of *image* upscaled 2x with Lanczos."""
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')

        # Simple preprocessing: double the resolution before OCR.
        doubled_size = (image.width * 2, image.height * 2)
        return image.resize(doubled_size, Image.Resampling.LANCZOS)

    def _run_ocr_parallel(self, pil_images) -> Dict[str, list]:
        """Run EasyOCR and Tesseract concurrently over *pil_images*.

        Args:
            pil_images: Sequence of PIL images, one per page.

        Returns:
            A dict with the keys ``'easyocr'`` and ``'tesseract'``. Each value
            is a list with exactly one string per input image. An engine that
            is not loaded or that fails contributes empty strings.
        """
        page_count = len(pil_images)
        results = {
            'easyocr': [''] * page_count,
            'tesseract': [''] * page_count,
        }
        reader = self._easyocr_reader

        def run_easyocr():
            # EasyOCR takes numpy arrays (not arbitrary PIL images) and, with
            # detail=0, returns a list of text fragments per image. Pages are
            # read one by one so differently sized pages are accepted, and the
            # fragments are joined so each page yields a single string.
            return [
                "\n".join(reader.readtext(np.asarray(page), detail=0))
                for page in pil_images
            ]

        def run_tesseract():
            return [
                pytesseract.image_to_string(page, lang='spa')
                for page in pil_images
            ]

        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {}

            # EasyOCR is only scheduled when its reader has been loaded.
            if reader is not None:
                futures['easyocr'] = executor.submit(run_easyocr)

            # Tesseract
            futures['tesseract'] = executor.submit(run_tesseract)

            # Collect results. A failing engine is deliberately best-effort:
            # it is logged and its pre-filled empty strings are kept so the
            # other engine can still supply the text.
            for name, future in futures.items():
                try:
                    results[name] = future.result()
                except Exception as e:
                    self.logger.error(f"OCR engine {name} failed: {e}")
                    results[name] = [''] * page_count

        return results

    def process(self, file_path: str) -> Dict[str, Any]:
        """OCR the PDF at *file_path* and write the text next to the downloads.

        The text is written to ``settings.LOCAL_DOWNLOADS_PATH / "<stem>.txt"``
        with pages separated by a blank line.

        Args:
            file_path: Path of the PDF to process.

        Returns:
            A dict with ``success`` (True), ``text_path`` (output file as
            str), ``text`` (the combined text) and ``pages_processed`` (number
            of rendered pages).

        Raises:
            FileProcessingError: If the OCR dependencies are not installed or
                any step of rendering, OCR or writing fails. Errors raised by
                ``validate_file`` happen before that handling and propagate
                unchanged.
        """
        self.validate_file(file_path)
        pdf_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt"

        if not PDF_OCR_AVAILABLE:
            raise FileProcessingError("PDF OCR dependencies not installed")

        self.logger.info(f"Processing PDF file: {pdf_path}")

        try:
            # Load EasyOCR if needed
            self._load_easyocr()
            vram_manager.update_usage()

            # Convert PDF to images
            self.logger.debug("Converting PDF to images")
            pil_images = convert_from_path(
                str(pdf_path),
                dpi=settings.PDF_DPI,
                fmt='png',
                thread_count=settings.PDF_RENDER_THREAD_COUNT,
            )

            # Process in batches. The size is clamped to at least 1 because
            # range() raises ValueError for a zero step.
            all_text = []
            batch_size = max(1, settings.PDF_BATCH_SIZE)
            total_batches = (len(pil_images) + batch_size - 1) // batch_size

            for start in range(0, len(pil_images), batch_size):
                batch = pil_images[start:start + batch_size]
                self.logger.debug(f"Processing batch {start // batch_size + 1}/{total_batches}")

                # Preprocess images
                preprocessed_batch = [self._preprocess_image(img) for img in batch]

                # Run OCR in parallel
                ocr_results = self._run_ocr_parallel(preprocessed_batch)

                # Combine results: EasyOCR first, Tesseract as the fallback;
                # pages where both engines returned nothing are skipped.
                for easy_text, tesseract_text in zip(
                    ocr_results['easyocr'], ocr_results['tesseract']
                ):
                    text = easy_text or tesseract_text
                    if text:
                        all_text.append(text)

            # Save combined text
            combined_text = "\n\n".join(all_text)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(combined_text)

            self.logger.info(f"PDF processing completed: {output_path}")

            return {
                "success": True,
                "text_path": str(output_path),
                "text": combined_text,
                "pages_processed": len(pil_images),
            }

        except Exception as e:
            self.logger.error(f"PDF processing failed: {e}")
            # Chain the cause so the original traceback is not lost.
            raise FileProcessingError(f"PDF processing failed: {e}") from e

    def cleanup(self) -> None:
        """Drop the cached EasyOCR reader and ask vram_manager to clean up."""
        self._easyocr_reader = None
        vram_manager.cleanup()
|
||||
Reference in New Issue
Block a user