CBCFacil v8.0 - Refactored with AMD GPU support
This commit is contained in:
15
processors/__init__.py
Normal file
15
processors/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
Processors package for CBCFacil
|
||||
"""
|
||||
|
||||
from .base_processor import FileProcessor
|
||||
from .audio_processor import AudioProcessor
|
||||
from .pdf_processor import PDFProcessor
|
||||
from .text_processor import TextProcessor
|
||||
|
||||
__all__ = [
|
||||
'FileProcessor',
|
||||
'AudioProcessor',
|
||||
'PDFProcessor',
|
||||
'TextProcessor'
|
||||
]
|
||||
93
processors/audio_processor.py
Normal file
93
processors/audio_processor.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""
|
||||
Audio file processor using Whisper
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from ..services import vram_manager
|
||||
from ..services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
try:
|
||||
import whisper
|
||||
import torch
|
||||
WHISPER_AVAILABLE = True
|
||||
except ImportError:
|
||||
WHISPER_AVAILABLE = False
|
||||
|
||||
|
||||
class AudioProcessor(FileProcessor):
    """Processor for audio files: transcribes speech to text using Whisper.

    The Whisper model is loaded lazily on first use and placed on whatever
    device ``gpu_detector`` reports (GPU when available, otherwise CPU).
    """

    def __init__(self):
        super().__init__("AudioProcessor")
        self.logger = logging.getLogger(__name__)
        self._model = None  # lazily-loaded Whisper model (see _load_model)
        self._model_name = "medium"  # Optimized for Spanish

    def can_process(self, file_path: str) -> bool:
        """Return True when the file extension is a configured audio extension."""
        return self.get_file_extension(file_path) in settings.AUDIO_EXTENSIONS

    def _load_model(self):
        """Load the Whisper model lazily.

        Raises:
            FileProcessingError: if the whisper/torch dependencies are missing.
        """
        if not WHISPER_AVAILABLE:
            raise FileProcessingError("Whisper not installed")

        if self._model is None:
            device = gpu_detector.get_device()
            self.logger.info(f"Loading Whisper model: {self._model_name} on {device}")
            self._model = whisper.load_model(self._model_name, device=device)
            vram_manager.update_usage()

    def process(self, file_path: str) -> Dict[str, Any]:
        """Transcribe an audio file to Spanish text.

        The transcription is written to ``<stem>.txt`` under
        ``settings.LOCAL_DOWNLOADS_PATH`` and also returned inline.

        Returns:
            Dict with ``success``, ``transcription_path``, ``text`` and
            ``model_used`` keys.

        Raises:
            FileProcessingError: if the file is invalid, Whisper is not
                installed, or transcription fails.
        """
        self.validate_file(file_path)
        audio_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{audio_path.stem}.txt"

        self.logger.info(f"Processing audio file: {audio_path}")

        try:
            # Load model if needed
            self._load_model()

            # Update VRAM usage
            vram_manager.update_usage()

            # BUGFIX: fp16 is not supported on CPU (Whisper warns and falls
            # back to fp32) — only request half precision on an accelerator.
            # Assumes gpu_detector.get_device() returns "cpu" for CPU — TODO confirm.
            use_fp16 = gpu_detector.get_device() != "cpu"

            # inference_mode disables autograd bookkeeping for lower memory use.
            with torch.inference_mode():
                result = self._model.transcribe(
                    str(audio_path),
                    language="es",
                    fp16=use_fp16,
                    verbose=False
                )

            # Save transcription
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(result["text"], encoding='utf-8')

            self.logger.info(f"Transcription completed: {output_path}")

            return {
                "success": True,
                "transcription_path": str(output_path),
                "text": result["text"],
                "model_used": self._model_name
            }

        except FileProcessingError:
            # BUGFIX: don't re-wrap our own error (the original doubled the
            # message, e.g. "Audio processing failed: Whisper not installed").
            raise
        except Exception as e:
            self.logger.error(f"Audio processing failed: {e}")
            raise FileProcessingError(f"Audio processing failed: {e}") from e

    def cleanup(self) -> None:
        """Release the Whisper model and reclaim VRAM."""
        if self._model is not None:
            del self._model
            self._model = None
            vram_manager.cleanup()
|
||||
40
processors/base_processor.py
Normal file
40
processors/base_processor.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""
|
||||
Base File Processor (Strategy Pattern)
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from ..core import FileProcessingError
|
||||
|
||||
|
||||
class FileProcessor(ABC):
    """Abstract base class for file processors (Strategy pattern).

    Concrete processors implement ``can_process`` and ``process``; this
    base supplies small path helpers plus file validation.
    """

    def __init__(self, name: str):
        # Human-readable processor name, useful for logging/identification.
        self.name = name

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Return True when this processor handles the given file type."""
        ...

    @abstractmethod
    def process(self, file_path: str) -> Dict[str, Any]:
        """Run the processor over the file and return a result dict."""
        ...

    def get_file_extension(self, file_path: str) -> str:
        """Return the lower-cased extension of *file_path*, dot included."""
        return Path(file_path).suffix.lower()

    def get_base_name(self, file_path: str) -> str:
        """Return the file name of *file_path* without its final extension."""
        return Path(file_path).stem

    def validate_file(self, file_path: str) -> None:
        """Ensure *file_path* names an existing regular file.

        Raises:
            FileProcessingError: if the path is missing or not a file.
        """
        candidate = Path(file_path)
        if not candidate.exists():
            raise FileProcessingError(f"File not found: {file_path}")
        if not candidate.is_file():
            raise FileProcessingError(f"Path is not a file: {file_path}")
||||
159
processors/pdf_processor.py
Normal file
159
processors/pdf_processor.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
PDF file processor with OCR
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from ..services import vram_manager
|
||||
from ..services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
try:
|
||||
import torch
|
||||
import pytesseract
|
||||
import easyocr
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pdf2image import convert_from_path
|
||||
from PIL import Image
|
||||
PDF_OCR_AVAILABLE = True
|
||||
except ImportError:
|
||||
PDF_OCR_AVAILABLE = False
|
||||
|
||||
|
||||
class PDFProcessor(FileProcessor):
    """Processor for PDF files: renders pages to images and runs OCR.

    Two OCR engines run in parallel per batch — EasyOCR (GPU-backed when
    available) and Tesseract (CPU). Per page, EasyOCR output is preferred
    and Tesseract is used as the fallback.
    """

    def __init__(self):
        super().__init__("PDFProcessor")
        self.logger = logging.getLogger(__name__)
        self._easyocr_reader = None  # lazily created easyocr.Reader

    def can_process(self, file_path: str) -> bool:
        """Return True if the file is a PDF."""
        return self.get_file_extension(file_path) == ".pdf"

    def _load_easyocr(self):
        """Create the EasyOCR reader lazily (on GPU when one is detected)."""
        if self._easyocr_reader is None:
            use_gpu = gpu_detector.is_available()
            self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})")
            self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu)
            vram_manager.update_usage()

    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """Grayscale + 2x LANCZOS upscale to improve OCR accuracy."""
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')

        # Upscaling helps both engines resolve small glyphs.
        return image.resize((image.width * 2, image.height * 2), Image.Resampling.LANCZOS)

    @staticmethod
    def _normalize_ocr_text(value) -> str:
        """Coerce one page's OCR result into a single string.

        EasyOCR with ``detail=0`` yields a *list* of text fragments per page
        while Tesseract yields one string; normalize both to ``str`` so the
        final join cannot raise TypeError.
        """
        if isinstance(value, (list, tuple)):
            return "\n".join(str(part) for part in value)
        return value or ""

    def _run_ocr_parallel(self, pil_images) -> Dict[str, list]:
        """Run both OCR engines concurrently over a batch of images.

        Returns:
            Dict mapping engine name -> list of per-page results; a failed
            engine contributes empty strings so indices stay aligned.
        """
        page_count = len(pil_images)
        results = {
            'easyocr': [''] * page_count,
            'tesseract': [''] * page_count
        }

        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {}

            # EasyOCR
            if self._easyocr_reader:
                futures['easyocr'] = executor.submit(
                    self._easyocr_reader.readtext_batched,
                    pil_images,
                    detail=0
                )

            # Tesseract
            futures['tesseract'] = executor.submit(
                lambda imgs: [pytesseract.image_to_string(img, lang='spa') for img in imgs],
                pil_images
            )

            # Collect results; one engine failing must not sink the batch —
            # keep the placeholder empties so the other engine's text is used.
            for name, future in futures.items():
                try:
                    results[name] = future.result()
                except Exception as e:
                    self.logger.error(f"OCR engine {name} failed: {e}")
                    results[name] = [''] * page_count

        return results

    def process(self, file_path: str) -> Dict[str, Any]:
        """OCR a PDF, writing the extracted text to the downloads directory.

        Returns:
            Dict with ``success``, ``text_path``, ``text`` and
            ``pages_processed`` keys.

        Raises:
            FileProcessingError: if OCR deps are missing or processing fails.
        """
        self.validate_file(file_path)
        pdf_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt"

        if not PDF_OCR_AVAILABLE:
            raise FileProcessingError("PDF OCR dependencies not installed")

        self.logger.info(f"Processing PDF file: {pdf_path}")

        try:
            # Load EasyOCR if needed
            self._load_easyocr()
            vram_manager.update_usage()

            # Convert PDF to images
            self.logger.debug("Converting PDF to images")
            pil_images = convert_from_path(
                str(pdf_path),
                dpi=settings.PDF_DPI,
                fmt='png',
                thread_count=settings.PDF_RENDER_THREAD_COUNT
            )

            # Process in batches to bound memory use.
            all_text = []
            batch_size = settings.PDF_BATCH_SIZE
            total_batches = (len(pil_images) + batch_size - 1) // batch_size

            for i in range(0, len(pil_images), batch_size):
                batch = pil_images[i:i + batch_size]
                self.logger.debug(f"Processing batch {i//batch_size + 1}/{total_batches}")

                # Preprocess images
                preprocessed_batch = [self._preprocess_image(img) for img in batch]

                # Run OCR in parallel
                ocr_results = self._run_ocr_parallel(preprocessed_batch)

                # Combine results: prefer EasyOCR, fall back to Tesseract.
                for j in range(len(batch)):
                    # BUGFIX: normalize per-page results to str — EasyOCR's
                    # detail=0 output is a list, which would have made the
                    # final "\n\n".join(all_text) raise TypeError.
                    text = (self._normalize_ocr_text(ocr_results['easyocr'][j])
                            or self._normalize_ocr_text(ocr_results['tesseract'][j]))
                    if text:
                        all_text.append(text)

            # Save combined text
            combined = "\n\n".join(all_text)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(combined, encoding='utf-8')

            self.logger.info(f"PDF processing completed: {output_path}")

            return {
                "success": True,
                "text_path": str(output_path),
                "text": combined,
                "pages_processed": len(pil_images)
            }

        except FileProcessingError:
            # Don't re-wrap our own error type.
            raise
        except Exception as e:
            self.logger.error(f"PDF processing failed: {e}")
            raise FileProcessingError(f"PDF processing failed: {e}") from e

    def cleanup(self) -> None:
        """Drop the OCR reader reference and release VRAM."""
        self._easyocr_reader = None
        vram_manager.cleanup()
|
||||
55
processors/text_processor.py
Normal file
55
processors/text_processor.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""
|
||||
Text file processor
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
|
||||
class TextProcessor(FileProcessor):
    """Processor for plain-text files: copies them to the downloads dir."""

    def __init__(self):
        super().__init__("TextProcessor")
        self.logger = logging.getLogger(__name__)

    def can_process(self, file_path: str) -> bool:
        """Return True if the extension is a configured text extension."""
        return self.get_file_extension(file_path) in settings.TXT_EXTENSIONS

    def process(self, file_path: str) -> Dict[str, Any]:
        """Copy a text file into the downloads directory and return its text.

        Returns:
            Dict with ``success``, ``text_path`` and ``text`` keys.

        Raises:
            FileProcessingError: if the file is missing or the copy fails.
        """
        self.validate_file(file_path)
        text_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / text_path.name

        self.logger.info(f"Processing text file: {text_path}")

        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # PERF/BUGFIX: read once and reuse the content for both the copy
            # and the return value — the original re-read the file it had
            # just finished writing.
            content = text_path.read_text(encoding='utf-8')
            output_path.write_text(content, encoding='utf-8')

            self.logger.info(f"Text file processing completed: {output_path}")

            return {
                "success": True,
                "text_path": str(output_path),
                "text": content
            }

        except FileProcessingError:
            # Don't re-wrap our own error type.
            raise
        except Exception as e:
            self.logger.error(f"Text processing failed: {e}")
            raise FileProcessingError(f"Text processing failed: {e}") from e

    def _read_file(self, file_path: Path) -> str:
        """Read and return a UTF-8 file's full content."""
        return Path(file_path).read_text(encoding='utf-8')
|
||||
Reference in New Issue
Block a user