Files
cbc2027/processors/pdf_processor.py

165 lines
5.4 KiB
Python

"""
PDF file processor with OCR
"""
import logging
from pathlib import Path
from typing import Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from core import FileProcessingError
from config import settings
from services import vram_manager
from services.gpu_detector import gpu_detector
from .base_processor import FileProcessor
try:
import torch
import pytesseract
import easyocr
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image
PDF_OCR_AVAILABLE = True
except ImportError:
PDF_OCR_AVAILABLE = False
# Provide stub for type hints
try:
from PIL import Image
except ImportError:
Image = None # type: ignore
class PDFProcessor(FileProcessor):
    """Processor that extracts text from PDF files via OCR.

    Every page is rendered to an image, preprocessed, and read by both
    EasyOCR and Tesseract. The EasyOCR text is preferred for a page and
    the Tesseract text is used when EasyOCR produced nothing.
    """

    def __init__(self):
        super().__init__("PDFProcessor")
        self.logger = logging.getLogger(__name__)
        # Created lazily by _load_easyocr() and dropped again by cleanup().
        self._easyocr_reader = None

    def can_process(self, file_path: str) -> bool:
        """Return True when ``get_file_extension(file_path)`` is ``".pdf"``."""
        return self.get_file_extension(file_path) == ".pdf"

    def _load_easyocr(self):
        """Create the EasyOCR reader on first use.

        The reader is built for Spanish (``'es'``) and uses the GPU when
        ``gpu_detector.is_available()`` says one is present. Subsequent
        calls are no-ops while a reader is already held.
        """
        if self._easyocr_reader is None:
            use_gpu = gpu_detector.is_available()
            self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})")
            self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu)
            # Let the VRAM manager account for the freshly loaded model.
            vram_manager.update_usage()

    # The annotations are quoted on purpose: when Pillow is missing the
    # module-level fallback sets ``Image = None``, and an unquoted
    # ``Image.Image`` would raise AttributeError while the class body is
    # being executed, i.e. at import time.
    def _preprocess_image(self, image: "Image.Image") -> "Image.Image":
        """Return ``image`` as grayscale, upscaled 2x with Lanczos resampling.

        Args:
            image: A PIL image of any mode.

        Returns:
            A mode ``'L'`` image with twice the width and height.
        """
        if image.mode != 'L':
            image = image.convert('L')
        doubled_size = (image.width * 2, image.height * 2)
        return image.resize(doubled_size, Image.Resampling.LANCZOS)

    def _run_ocr_parallel(self, pil_images) -> Dict[str, list]:
        """Run EasyOCR and Tesseract concurrently over a batch of pages.

        Args:
            pil_images: Sequence of PIL images, one per page.

        Returns:
            A dict with the keys ``'easyocr'`` and ``'tesseract'``. Each
            value is a list of strings with exactly one entry per input
            image. An engine that fails, or EasyOCR when no reader is
            loaded, contributes empty strings.
        """
        page_count = len(pil_images)
        results = {
            'easyocr': [''] * page_count,
            'tesseract': [''] * page_count,
        }
        reader = self._easyocr_reader

        def run_easyocr(images):
            # EasyOCR documents file path / URL / bytes / numpy array as
            # valid inputs, so each PIL page is converted to an array.
            # Pages are read one at a time because the batched API expects
            # equally sized images, which PDF pages do not guarantee.
            # With detail=0 the result is a list of text fragments; they
            # are joined so every page yields a single string.
            page_texts = []
            for page in images:
                fragments = reader.readtext(np.asarray(page), detail=0)
                page_texts.append("\n".join(fragments))
            return page_texts

        def run_tesseract(images):
            return [pytesseract.image_to_string(page, lang='spa') for page in images]

        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {}
            if reader is not None:
                futures['easyocr'] = executor.submit(run_easyocr, pil_images)
            futures['tesseract'] = executor.submit(run_tesseract, pil_images)

            # One failing engine must not sink the other: log it and fall
            # back to blank output for that engine only.
            for name, future in futures.items():
                try:
                    results[name] = future.result()
                except Exception as e:
                    self.logger.error(f"OCR engine {name} failed: {e}")
                    results[name] = [''] * page_count
        return results

    def process(self, file_path: str) -> Dict[str, Any]:
        """OCR a PDF and write the recognised text next to other downloads.

        The text is written to
        ``settings.LOCAL_DOWNLOADS_PATH / "<pdf stem>.txt"`` as UTF-8, with
        pages separated by a blank line. Pages for which neither engine
        produced text are skipped.

        Args:
            file_path: Path of the PDF to process.

        Returns:
            A dict with ``"success"`` (always True on return),
            ``"text_path"`` (the output file as a string), ``"text"`` (the
            combined text) and ``"pages_processed"`` (number of rendered
            pages).

        Raises:
            FileProcessingError: If the OCR dependencies are not installed
                or any step of rendering, OCR or writing fails. Whatever
                ``validate_file`` raises propagates unchanged.
        """
        self.validate_file(file_path)
        pdf_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt"

        if not PDF_OCR_AVAILABLE:
            raise FileProcessingError("PDF OCR dependencies not installed")

        self.logger.info(f"Processing PDF file: {pdf_path}")
        try:
            self._load_easyocr()
            vram_manager.update_usage()

            self.logger.debug("Converting PDF to images")
            pil_images = convert_from_path(
                str(pdf_path),
                dpi=settings.PDF_DPI,
                fmt='png',
                thread_count=settings.PDF_RENDER_THREAD_COUNT,
            )

            all_text = []
            # A configured batch size of 0 would make range() raise.
            batch_size = max(1, settings.PDF_BATCH_SIZE)
            total_batches = (len(pil_images) + batch_size - 1) // batch_size
            batch_starts = range(0, len(pil_images), batch_size)
            for batch_no, start in enumerate(batch_starts, start=1):
                batch = pil_images[start:start + batch_size]
                self.logger.debug(f"Processing batch {batch_no}/{total_batches}")

                preprocessed_batch = [self._preprocess_image(img) for img in batch]
                ocr_results = self._run_ocr_parallel(preprocessed_batch)

                # Prefer EasyOCR; fall back to Tesseract for pages where
                # EasyOCR returned nothing.
                for easy_text, tess_text in zip(ocr_results['easyocr'],
                                                ocr_results['tesseract']):
                    text = easy_text or tess_text
                    if text:
                        all_text.append(text)

            combined_text = "\n\n".join(all_text)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(combined_text)

            self.logger.info(f"PDF processing completed: {output_path}")
            return {
                "success": True,
                "text_path": str(output_path),
                "text": combined_text,
                "pages_processed": len(pil_images),
            }
        except Exception as e:
            self.logger.error(f"PDF processing failed: {e}")
            # Chain the cause so the original traceback is preserved.
            raise FileProcessingError(f"PDF processing failed: {e}") from e

    def cleanup(self) -> None:
        """Drop the EasyOCR reader and ask the VRAM manager to clean up."""
        self._easyocr_reader = None
        vram_manager.cleanup()