165 lines
5.4 KiB
Python
165 lines
5.4 KiB
Python
"""
|
|
PDF file processor with OCR
|
|
"""
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, Any
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from core import FileProcessingError
|
|
from config import settings
|
|
from services import vram_manager
|
|
from services.gpu_detector import gpu_detector
|
|
from .base_processor import FileProcessor
|
|
|
|
# Optional OCR stack: the module must stay importable when any of these
# packages is missing, so the outcome is recorded in PDF_OCR_AVAILABLE
# instead of letting the ImportError propagate.
try:
    import torch
    import pytesseract
    import easyocr
    import cv2
    import numpy as np
    from pdf2image import convert_from_path
    from PIL import Image
except ImportError:
    PDF_OCR_AVAILABLE = False
    # Keep the name ``Image`` bound so that code referring to it (e.g. in
    # type hints) still finds a module-level name when Pillow is absent.
    try:
        from PIL import Image
    except ImportError:
        Image = None  # type: ignore
else:
    PDF_OCR_AVAILABLE = True
|
|
|
|
|
|
class PDFProcessor(FileProcessor):
    """Processor for PDF files with OCR.

    Every page of the PDF is rendered to an image, converted to grayscale
    and upscaled, then read by EasyOCR and Tesseract concurrently. For each
    page the EasyOCR text is used when it is non-empty; otherwise the
    Tesseract text is used. The combined text is written to a ``.txt`` file
    under ``settings.LOCAL_DOWNLOADS_PATH``.
    """

    def __init__(self):
        super().__init__("PDFProcessor")
        self.logger = logging.getLogger(__name__)
        # Created lazily by _load_easyocr() and dropped again by cleanup().
        self._easyocr_reader = None

    def can_process(self, file_path: str) -> bool:
        """Return True when ``file_path`` has the ``.pdf`` extension."""
        return self.get_file_extension(file_path) == ".pdf"

    def _load_easyocr(self):
        """Create the EasyOCR reader on first use (Spanish model)."""
        if self._easyocr_reader is None:
            use_gpu = gpu_detector.is_available()
            self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})")
            self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu)
            vram_manager.update_usage()

    # The annotations are strings on purpose: when Pillow is not installed
    # the module-level ``Image`` is ``None`` and evaluating ``Image.Image``
    # while the class body executes would raise AttributeError.
    def _preprocess_image(self, image: "Image.Image") -> "Image.Image":
        """Return a grayscale copy of ``image`` upscaled to twice its size."""
        if image.mode != 'L':
            image = image.convert('L')

        # Pillow >= 9.1 groups the filters under Image.Resampling; older
        # releases expose LANCZOS directly on the Image module.
        resampling = getattr(Image, "Resampling", Image)
        return image.resize((image.width * 2, image.height * 2), resampling.LANCZOS)

    def _easyocr_pages(self, pil_images) -> list:
        """Run EasyOCR page by page and return one text string per page."""
        page_texts = []
        for img in pil_images:
            # EasyOCR documents file paths, bytes and numpy arrays as input,
            # so the PIL image is converted first. Pages are read one at a
            # time because the batched API needs equally sized images.
            fragments = self._easyocr_reader.readtext(np.array(img), detail=0)
            # detail=0 yields a list of text fragments; merge them so every
            # page is represented by a single string.
            page_texts.append("\n".join(fragments))
        return page_texts

    @staticmethod
    def _tesseract_pages(pil_images) -> list:
        """Run Tesseract (Spanish) on every page and return one string per page."""
        return [pytesseract.image_to_string(img, lang='spa') for img in pil_images]

    def _run_ocr_parallel(self, pil_images) -> Dict[str, list]:
        """Run all OCR engines in parallel.

        Args:
            pil_images: Preprocessed page images.

        Returns:
            A dict with the keys ``'easyocr'`` and ``'tesseract'``; each value
            is a list with one text string per input image. An engine that
            fails (or, for EasyOCR, is not loaded) contributes empty strings.
        """
        page_count = len(pil_images)
        results = {
            'easyocr': [''] * page_count,
            'tesseract': [''] * page_count,
        }

        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {}

            # EasyOCR is only scheduled when its reader has been loaded.
            if self._easyocr_reader:
                futures['easyocr'] = executor.submit(self._easyocr_pages, pil_images)

            futures['tesseract'] = executor.submit(self._tesseract_pages, pil_images)

            for name, future in futures.items():
                try:
                    results[name] = future.result()
                except Exception as e:
                    # Best effort: a failing engine must not abort the other.
                    self.logger.error(f"OCR engine {name} failed: {e}")
                    results[name] = [''] * page_count

        return results

    def process(self, file_path: str) -> Dict[str, Any]:
        """Process PDF with OCR.

        Args:
            file_path: Path of the PDF file to process.

        Returns:
            A dict with ``success``, ``text_path`` (path of the written text
            file), ``text`` (the combined text) and ``pages_processed``.

        Raises:
            FileProcessingError: If the OCR dependencies are not installed or
                any step of the conversion/OCR/writing fails.
        """
        self.validate_file(file_path)
        pdf_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt"

        if not PDF_OCR_AVAILABLE:
            raise FileProcessingError("PDF OCR dependencies not installed")

        self.logger.info(f"Processing PDF file: {pdf_path}")

        try:
            self._load_easyocr()
            vram_manager.update_usage()

            self.logger.debug("Converting PDF to images")
            pil_images = convert_from_path(
                str(pdf_path),
                dpi=settings.PDF_DPI,
                fmt='png',
                thread_count=settings.PDF_RENDER_THREAD_COUNT,
            )

            all_text = []
            # A batch size below 1 would make range() raise or skip all pages.
            batch_size = max(1, settings.PDF_BATCH_SIZE)
            total_batches = (len(pil_images) + batch_size - 1) // batch_size

            for batch_number, start in enumerate(range(0, len(pil_images), batch_size), start=1):
                batch = pil_images[start:start + batch_size]
                self.logger.debug(f"Processing batch {batch_number}/{total_batches}")

                preprocessed_batch = [self._preprocess_image(img) for img in batch]
                ocr_results = self._run_ocr_parallel(preprocessed_batch)

                # Prefer EasyOCR per page and fall back to Tesseract; pages
                # for which both engines returned nothing are left out.
                for easy_text, tesseract_text in zip(ocr_results['easyocr'], ocr_results['tesseract']):
                    text = easy_text or tesseract_text
                    if text:
                        all_text.append(text)

            combined_text = "\n\n".join(all_text)

            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(combined_text)

            self.logger.info(f"PDF processing completed: {output_path}")

            return {
                "success": True,
                "text_path": str(output_path),
                "text": combined_text,
                "pages_processed": len(pil_images),
            }

        except Exception as e:
            self.logger.error(f"PDF processing failed: {e}")
            # Chain the original exception so its traceback is not lost.
            raise FileProcessingError(f"PDF processing failed: {e}") from e

    def cleanup(self) -> None:
        """Drop the EasyOCR reader and ask the VRAM manager to clean up."""
        self._easyocr_reader = None
        vram_manager.cleanup()
|