CBCFacil v8.0 - Refactored with AMD GPU support
This commit is contained in:
15
processors/__init__.py
Normal file
15
processors/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
Processors package for CBCFacil
|
||||
"""
|
||||
|
||||
from .base_processor import FileProcessor
|
||||
from .audio_processor import AudioProcessor
|
||||
from .pdf_processor import PDFProcessor
|
||||
from .text_processor import TextProcessor
|
||||
|
||||
__all__ = [
|
||||
'FileProcessor',
|
||||
'AudioProcessor',
|
||||
'PDFProcessor',
|
||||
'TextProcessor'
|
||||
]
|
||||
93
processors/audio_processor.py
Normal file
93
processors/audio_processor.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""
|
||||
Audio file processor using Whisper
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from ..services import vram_manager
|
||||
from ..services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
try:
|
||||
import whisper
|
||||
import torch
|
||||
WHISPER_AVAILABLE = True
|
||||
except ImportError:
|
||||
WHISPER_AVAILABLE = False
|
||||
|
||||
|
||||
class AudioProcessor(FileProcessor):
    """Processor for audio files: transcribes speech to text using Whisper.

    The Whisper model is loaded lazily on first use and placed on whatever
    device ``gpu_detector`` reports (GPU when available, otherwise CPU).
    """

    def __init__(self):
        super().__init__("AudioProcessor")
        self.logger = logging.getLogger(__name__)
        self._model = None  # lazily-loaded Whisper model (see _load_model)
        self._model_name = "medium"  # Optimized for Spanish

    def can_process(self, file_path: str) -> bool:
        """Return True when the file extension is a configured audio extension."""
        return self.get_file_extension(file_path) in settings.AUDIO_EXTENSIONS

    def _load_model(self):
        """Load the Whisper model lazily.

        Raises:
            FileProcessingError: if the whisper/torch dependencies are missing.
        """
        if not WHISPER_AVAILABLE:
            raise FileProcessingError("Whisper not installed")

        if self._model is None:
            device = gpu_detector.get_device()
            self.logger.info(f"Loading Whisper model: {self._model_name} on {device}")
            self._model = whisper.load_model(self._model_name, device=device)
            vram_manager.update_usage()

    def process(self, file_path: str) -> Dict[str, Any]:
        """Transcribe an audio file to Spanish text.

        The transcription is written to ``<stem>.txt`` under
        ``settings.LOCAL_DOWNLOADS_PATH`` and also returned inline.

        Returns:
            Dict with ``success``, ``transcription_path``, ``text`` and
            ``model_used`` keys.

        Raises:
            FileProcessingError: if the file is invalid, Whisper is not
                installed, or transcription fails.
        """
        self.validate_file(file_path)
        audio_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{audio_path.stem}.txt"

        self.logger.info(f"Processing audio file: {audio_path}")

        try:
            # Load model if needed
            self._load_model()

            # Update VRAM usage
            vram_manager.update_usage()

            # BUGFIX: fp16 is not supported on CPU (Whisper warns and falls
            # back to fp32) — only request half precision on an accelerator.
            # Assumes gpu_detector.get_device() returns "cpu" for CPU — TODO confirm.
            use_fp16 = gpu_detector.get_device() != "cpu"

            # inference_mode disables autograd bookkeeping for lower memory use.
            with torch.inference_mode():
                result = self._model.transcribe(
                    str(audio_path),
                    language="es",
                    fp16=use_fp16,
                    verbose=False
                )

            # Save transcription
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(result["text"], encoding='utf-8')

            self.logger.info(f"Transcription completed: {output_path}")

            return {
                "success": True,
                "transcription_path": str(output_path),
                "text": result["text"],
                "model_used": self._model_name
            }

        except FileProcessingError:
            # BUGFIX: don't re-wrap our own error (the original doubled the
            # message, e.g. "Audio processing failed: Whisper not installed").
            raise
        except Exception as e:
            self.logger.error(f"Audio processing failed: {e}")
            raise FileProcessingError(f"Audio processing failed: {e}") from e

    def cleanup(self) -> None:
        """Release the Whisper model and reclaim VRAM."""
        if self._model is not None:
            del self._model
            self._model = None
            vram_manager.cleanup()
|
||||
40
processors/base_processor.py
Normal file
40
processors/base_processor.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""
|
||||
Base File Processor (Strategy Pattern)
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from ..core import FileProcessingError
|
||||
|
||||
|
||||
class FileProcessor(ABC):
    """Abstract base class for file processors (Strategy pattern).

    Concrete processors implement ``can_process`` and ``process``; this
    base supplies small path helpers plus file validation.
    """

    def __init__(self, name: str):
        # Human-readable processor name, useful for logging/identification.
        self.name = name

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Return True when this processor handles the given file type."""
        ...

    @abstractmethod
    def process(self, file_path: str) -> Dict[str, Any]:
        """Run the processor over the file and return a result dict."""
        ...

    def get_file_extension(self, file_path: str) -> str:
        """Return the lower-cased extension of *file_path*, dot included."""
        return Path(file_path).suffix.lower()

    def get_base_name(self, file_path: str) -> str:
        """Return the file name of *file_path* without its final extension."""
        return Path(file_path).stem

    def validate_file(self, file_path: str) -> None:
        """Ensure *file_path* names an existing regular file.

        Raises:
            FileProcessingError: if the path is missing or not a file.
        """
        candidate = Path(file_path)
        if not candidate.exists():
            raise FileProcessingError(f"File not found: {file_path}")
        if not candidate.is_file():
            raise FileProcessingError(f"Path is not a file: {file_path}")
||||
159
processors/pdf_processor.py
Normal file
159
processors/pdf_processor.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
PDF file processor with OCR
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from ..services import vram_manager
|
||||
from ..services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
try:
|
||||
import torch
|
||||
import pytesseract
|
||||
import easyocr
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pdf2image import convert_from_path
|
||||
from PIL import Image
|
||||
PDF_OCR_AVAILABLE = True
|
||||
except ImportError:
|
||||
PDF_OCR_AVAILABLE = False
|
||||
|
||||
|
||||
class PDFProcessor(FileProcessor):
    """Processor for PDF files: renders pages to images and runs OCR.

    Two OCR engines run in parallel per batch — EasyOCR (GPU-backed when
    available) and Tesseract (CPU). Per page, EasyOCR output is preferred
    and Tesseract is used as the fallback.
    """

    def __init__(self):
        super().__init__("PDFProcessor")
        self.logger = logging.getLogger(__name__)
        self._easyocr_reader = None  # lazily created easyocr.Reader

    def can_process(self, file_path: str) -> bool:
        """Return True if the file is a PDF."""
        return self.get_file_extension(file_path) == ".pdf"

    def _load_easyocr(self):
        """Create the EasyOCR reader lazily (on GPU when one is detected)."""
        if self._easyocr_reader is None:
            use_gpu = gpu_detector.is_available()
            self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})")
            self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu)
            vram_manager.update_usage()

    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """Grayscale + 2x LANCZOS upscale to improve OCR accuracy."""
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')

        # Upscaling helps both engines resolve small glyphs.
        return image.resize((image.width * 2, image.height * 2), Image.Resampling.LANCZOS)

    @staticmethod
    def _normalize_ocr_text(value) -> str:
        """Coerce one page's OCR result into a single string.

        EasyOCR with ``detail=0`` yields a *list* of text fragments per page
        while Tesseract yields one string; normalize both to ``str`` so the
        final join cannot raise TypeError.
        """
        if isinstance(value, (list, tuple)):
            return "\n".join(str(part) for part in value)
        return value or ""

    def _run_ocr_parallel(self, pil_images) -> Dict[str, list]:
        """Run both OCR engines concurrently over a batch of images.

        Returns:
            Dict mapping engine name -> list of per-page results; a failed
            engine contributes empty strings so indices stay aligned.
        """
        page_count = len(pil_images)
        results = {
            'easyocr': [''] * page_count,
            'tesseract': [''] * page_count
        }

        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {}

            # EasyOCR
            if self._easyocr_reader:
                futures['easyocr'] = executor.submit(
                    self._easyocr_reader.readtext_batched,
                    pil_images,
                    detail=0
                )

            # Tesseract
            futures['tesseract'] = executor.submit(
                lambda imgs: [pytesseract.image_to_string(img, lang='spa') for img in imgs],
                pil_images
            )

            # Collect results; one engine failing must not sink the batch —
            # keep the placeholder empties so the other engine's text is used.
            for name, future in futures.items():
                try:
                    results[name] = future.result()
                except Exception as e:
                    self.logger.error(f"OCR engine {name} failed: {e}")
                    results[name] = [''] * page_count

        return results

    def process(self, file_path: str) -> Dict[str, Any]:
        """OCR a PDF, writing the extracted text to the downloads directory.

        Returns:
            Dict with ``success``, ``text_path``, ``text`` and
            ``pages_processed`` keys.

        Raises:
            FileProcessingError: if OCR deps are missing or processing fails.
        """
        self.validate_file(file_path)
        pdf_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt"

        if not PDF_OCR_AVAILABLE:
            raise FileProcessingError("PDF OCR dependencies not installed")

        self.logger.info(f"Processing PDF file: {pdf_path}")

        try:
            # Load EasyOCR if needed
            self._load_easyocr()
            vram_manager.update_usage()

            # Convert PDF to images
            self.logger.debug("Converting PDF to images")
            pil_images = convert_from_path(
                str(pdf_path),
                dpi=settings.PDF_DPI,
                fmt='png',
                thread_count=settings.PDF_RENDER_THREAD_COUNT
            )

            # Process in batches to bound memory use.
            all_text = []
            batch_size = settings.PDF_BATCH_SIZE
            total_batches = (len(pil_images) + batch_size - 1) // batch_size

            for i in range(0, len(pil_images), batch_size):
                batch = pil_images[i:i + batch_size]
                self.logger.debug(f"Processing batch {i//batch_size + 1}/{total_batches}")

                # Preprocess images
                preprocessed_batch = [self._preprocess_image(img) for img in batch]

                # Run OCR in parallel
                ocr_results = self._run_ocr_parallel(preprocessed_batch)

                # Combine results: prefer EasyOCR, fall back to Tesseract.
                for j in range(len(batch)):
                    # BUGFIX: normalize per-page results to str — EasyOCR's
                    # detail=0 output is a list, which would have made the
                    # final "\n\n".join(all_text) raise TypeError.
                    text = (self._normalize_ocr_text(ocr_results['easyocr'][j])
                            or self._normalize_ocr_text(ocr_results['tesseract'][j]))
                    if text:
                        all_text.append(text)

            # Save combined text
            combined = "\n\n".join(all_text)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(combined, encoding='utf-8')

            self.logger.info(f"PDF processing completed: {output_path}")

            return {
                "success": True,
                "text_path": str(output_path),
                "text": combined,
                "pages_processed": len(pil_images)
            }

        except FileProcessingError:
            # Don't re-wrap our own error type.
            raise
        except Exception as e:
            self.logger.error(f"PDF processing failed: {e}")
            raise FileProcessingError(f"PDF processing failed: {e}") from e

    def cleanup(self) -> None:
        """Drop the OCR reader reference and release VRAM."""
        self._easyocr_reader = None
        vram_manager.cleanup()
|
||||
55
processors/text_processor.py
Normal file
55
processors/text_processor.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""
|
||||
Text file processor
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
|
||||
class TextProcessor(FileProcessor):
    """Processor for plain-text files: copies them to the downloads dir."""

    def __init__(self):
        super().__init__("TextProcessor")
        self.logger = logging.getLogger(__name__)

    def can_process(self, file_path: str) -> bool:
        """Return True if the extension is a configured text extension."""
        return self.get_file_extension(file_path) in settings.TXT_EXTENSIONS

    def process(self, file_path: str) -> Dict[str, Any]:
        """Copy a text file into the downloads directory and return its text.

        Returns:
            Dict with ``success``, ``text_path`` and ``text`` keys.

        Raises:
            FileProcessingError: if the file is missing or the copy fails.
        """
        self.validate_file(file_path)
        text_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / text_path.name

        self.logger.info(f"Processing text file: {text_path}")

        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # PERF/BUGFIX: read once and reuse the content for both the copy
            # and the return value — the original re-read the file it had
            # just finished writing.
            content = text_path.read_text(encoding='utf-8')
            output_path.write_text(content, encoding='utf-8')

            self.logger.info(f"Text file processing completed: {output_path}")

            return {
                "success": True,
                "text_path": str(output_path),
                "text": content
            }

        except FileProcessingError:
            # Don't re-wrap our own error type.
            raise
        except Exception as e:
            self.logger.error(f"Text processing failed: {e}")
            raise FileProcessingError(f"Text processing failed: {e}") from e

    def _read_file(self, file_path: Path) -> str:
        """Read and return a UTF-8 file's full content."""
        return Path(file_path).read_text(encoding='utf-8')
|
||||
Reference in New Issue
Block a user