CBCFacil v8.0 - Refactored with AMD GPU support

2026-01-09 13:05:46 -03:00
parent cb17136f21
commit b017504c52
54 changed files with 7251 additions and 3670 deletions
--- a/backup/originals_20250109/ai_service.py
+++ b/backup/originals_20250109/ai_service.py
@@ -0,0 +1,69 @@
+"""
+AI Service - Unified interface for AI providers
+"""
+import logging
+from typing import Optional, Dict, Any
+
+from .config import settings
+from .core import AIProcessingError
+from .services.ai.provider_factory import AIProviderFactory, ai_provider_factory
+
+
+class AIService:
+    """Facade over the AI providers exposed by the provider factory.
+
+    Each public method delegates to a provider and turns an
+    ``AIProcessingError`` into a logged fallback value instead of letting
+    it propagate to the caller.
+    """
+
+    def __init__(self):
+        self._factory: Optional[AIProviderFactory] = None
+        self.logger = logging.getLogger(__name__)
+
+    @property
+    def factory(self) -> AIProviderFactory:
+        """Return the provider factory, binding the shared one on first access"""
+        if self._factory is not None:
+            return self._factory
+        self._factory = ai_provider_factory
+        return self._factory
+
+    def generate_text(
+        self,
+        prompt: str,
+        provider: Optional[str] = None,
+        max_tokens: int = 4096
+    ) -> str:
+        """Generate text for ``prompt``.
+
+        The provider named by ``provider`` is used, or 'gemini' when none is
+        given. On ``AIProcessingError`` the failure is logged and a string of
+        the form ``"Error: <message>"`` is returned.
+        """
+        try:
+            provider_name = provider or 'gemini'
+            backend = self.factory.get_provider(provider_name)
+            generated = backend.generate(prompt, max_tokens=max_tokens)
+        except AIProcessingError as exc:
+            self.logger.error(f"AI generation failed: {exc}")
+            return f"Error: {str(exc)}"
+        return generated
+
+    def summarize(self, text: str, **kwargs) -> str:
+        """Summarize ``text`` with the factory's best provider.
+
+        Extra keyword arguments are forwarded to the provider. On
+        ``AIProcessingError`` an ``"Error: <message>"`` string is returned.
+        """
+        try:
+            backend = self.factory.get_best_provider()
+            summary = backend.summarize(text, **kwargs)
+        except AIProcessingError as exc:
+            self.logger.error(f"Summarization failed: {exc}")
+            return f"Error: {str(exc)}"
+        return summary
+
+    def correct_text(self, text: str, **kwargs) -> str:
+        """Correct ``text`` with the factory's best provider.
+
+        Extra keyword arguments are forwarded to the provider. On
+        ``AIProcessingError`` the input text is returned unchanged.
+        """
+        try:
+            backend = self.factory.get_best_provider()
+            corrected = backend.correct_text(text, **kwargs)
+        except AIProcessingError as exc:
+            self.logger.error(f"Text correction failed: {exc}")
+            # Hand the caller back exactly what it passed in
+            return text
+        return corrected
+
+    def classify_content(self, text: str, **kwargs) -> Dict[str, Any]:
+        """Classify ``text`` with the factory's best provider.
+
+        Extra keyword arguments are forwarded to the provider. On
+        ``AIProcessingError`` the default category "otras_clases" is
+        returned with a confidence of 0.0.
+        """
+        try:
+            backend = self.factory.get_best_provider()
+            classification = backend.classify_content(text, **kwargs)
+        except AIProcessingError as exc:
+            self.logger.error(f"Classification failed: {exc}")
+            return {"category": "otras_clases", "confidence": 0.0}
+        return classification
+
+
+# Module-level AIService instance, created when this module is imported
+ai_service = AIService()
--- a/backup/originals_20250109/gemini_provider.py
+++ b/backup/originals_20250109/gemini_provider.py
@@ -0,0 +1,171 @@
+"""
+Gemini AI Provider implementation
+"""
+import logging
+import subprocess
+import shutil
+import requests
+import time
+from typing import Dict, Any, Optional
+
+from ..config import settings
+from ..core import AIProcessingError
+from .base_provider import AIProvider
+
+
+class GeminiProvider(AIProvider):
+    """Gemini provider that reaches the model through a CLI binary or the HTTP API.
+
+    The CLI is tried first when a binary path is configured or found on
+    ``PATH``; the HTTP API is the fallback when an API key is configured.
+    """
+
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+        self._cli_path = settings.GEMINI_CLI_PATH or shutil.which("gemini")
+        self._api_key = settings.GEMINI_API_KEY
+        self._flash_model = settings.GEMINI_FLASH_MODEL
+        self._pro_model = settings.GEMINI_PRO_MODEL
+        # HTTP session, created on the first API call and reused afterwards
+        self._session = None
+
+    @property
+    def name(self) -> str:
+        return "Gemini"
+
+    def is_available(self) -> bool:
+        """Return True when either a CLI path or an API key is configured"""
+        return bool(self._cli_path or self._api_key)
+
+    def _run_cli(self, prompt: str, use_flash: bool = True, timeout: int = 300) -> str:
+        """Run the Gemini CLI once and return its stripped stdout.
+
+        Args:
+            prompt: Text passed to the CLI as its last argument.
+            use_flash: Use the flash model when true, the pro model otherwise.
+            timeout: Seconds to wait for the process.
+
+        Raises:
+            AIProcessingError: If no CLI is configured, the process cannot be
+                started, times out, or exits with a non-zero status.
+        """
+        if not self._cli_path:
+            raise AIProcessingError("Gemini CLI not available")
+
+        model = self._flash_model if use_flash else self._pro_model
+        cmd = [self._cli_path, model, prompt]
+
+        # Only the process launch is guarded, so the non-zero-exit error raised
+        # further down is not re-wrapped by the generic handler.
+        try:
+            process = subprocess.run(
+                cmd,
+                text=True,
+                capture_output=True,
+                timeout=timeout,
+                shell=False
+            )
+        except subprocess.TimeoutExpired as e:
+            raise AIProcessingError(f"Gemini CLI timed out after {timeout}s") from e
+        except Exception as e:
+            raise AIProcessingError(f"Gemini CLI error: {e}") from e
+
+        if process.returncode != 0:
+            error_msg = (process.stderr or "").strip() or "Unknown error"
+            raise AIProcessingError(f"Gemini CLI failed: {error_msg}")
+
+        return process.stdout.strip()
+
+    def _call_api(self, prompt: str, use_flash: bool = True, timeout: int = 180) -> str:
+        """Call the Gemini ``generateContent`` endpoint and return the reply text.
+
+        Args:
+            prompt: Text sent as the single content part.
+            use_flash: Use the flash model when true, the pro model otherwise.
+            timeout: Seconds passed to the HTTP request.
+
+        Returns:
+            The stripped text of the first part of the first candidate.
+
+        Raises:
+            AIProcessingError: If no API key is configured, the request
+                fails, or the response does not have the expected shape.
+        """
+        if not self._api_key:
+            raise AIProcessingError("Gemini API key not configured")
+
+        model = self._flash_model if use_flash else self._pro_model
+
+        # Initialize session if needed
+        if self._session is None:
+            self._session = requests.Session()
+            adapter = requests.adapters.HTTPAdapter(
+                pool_connections=10,
+                pool_maxsize=20
+            )
+            self._session.mount('https://', adapter)
+
+        url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
+
+        payload = {
+            "contents": [{
+                "parts": [{"text": prompt}]
+            }]
+        }
+
+        # The key goes in a header, not the query string: request exceptions
+        # embed the full URL in their message, which would otherwise copy the
+        # key into every raised error and log line.
+        headers = {"x-goog-api-key": self._api_key}
+
+        try:
+            response = self._session.post(
+                url,
+                json=payload,
+                headers=headers,
+                timeout=timeout
+            )
+            response.raise_for_status()
+            data = response.json()
+        except requests.RequestException as e:
+            raise AIProcessingError(f"Gemini API request failed: {e}") from e
+        except ValueError as e:
+            # Body was not valid JSON
+            raise AIProcessingError(f"Gemini API response error: {e}") from e
+
+        candidates = data.get("candidates") if isinstance(data, dict) else None
+        if not candidates:
+            raise AIProcessingError("Empty response from Gemini API")
+
+        try:
+            parts = candidates[0]["content"]["parts"]
+        except (KeyError, IndexError, TypeError) as e:
+            raise AIProcessingError("Invalid response format from Gemini API") from e
+
+        try:
+            return parts[0]["text"].strip()
+        except (KeyError, IndexError, TypeError, AttributeError) as e:
+            raise AIProcessingError(f"Gemini API response error: {e}") from e
+
+    def _run(self, prompt: str, use_flash: bool = True, timeout: int = 300) -> str:
+        """Run the prompt through the CLI, falling back to the API.
+
+        Args:
+            prompt: Text to send to the model.
+            use_flash: Use the flash model when true, the pro model otherwise.
+            timeout: Seconds allowed for the CLI; API calls are capped at 180.
+
+        Raises:
+            AIProcessingError: If the API call fails, if the CLI fails and no
+                API key is configured, or if neither backend is configured.
+        """
+        cli_error: Optional[Exception] = None
+
+        # Try CLI first if available
+        if self._cli_path:
+            try:
+                return self._run_cli(prompt, use_flash, timeout)
+            except Exception as e:
+                cli_error = e
+                self.logger.warning(f"Gemini CLI failed, trying API: {e}")
+
+        # Fallback to API
+        if self._api_key:
+            return self._call_api(prompt, use_flash, min(timeout, 180))
+
+        if cli_error is not None:
+            # Surface the real failure instead of claiming nothing is configured
+            raise AIProcessingError(
+                f"Gemini CLI failed and no API key is configured: {cli_error}"
+            ) from cli_error
+
+        raise AIProcessingError("No Gemini provider available (CLI or API)")
+
+    def summarize(self, text: str, **kwargs) -> str:
+        """Ask the flash model for a Spanish summary of ``text``.
+
+        Extra keyword arguments are accepted but not used.
+        """
+        prompt = f"""Summarize the following text:
+
+{text}
+
+Provide a clear, concise summary in Spanish."""
+        return self._run(prompt, use_flash=True)
+
+    def correct_text(self, text: str, **kwargs) -> str:
+        """Ask the flash model to fix grammar, spelling and clarity of ``text``.
+
+        Extra keyword arguments are accepted but not used.
+        """
+        prompt = f"""Correct the following text for grammar, spelling, and clarity:
+
+{text}
+
+Return only the corrected text, nothing else."""
+        return self._run(prompt, use_flash=True)
+
+    def classify_content(self, text: str, **kwargs) -> Dict[str, Any]:
+        """Classify ``text`` into one of the known categories.
+
+        Extra keyword arguments are accepted but not used.
+
+        Returns:
+            A dict with ``category``, ``confidence`` and ``provider``. When
+            the model's answer is not a known category, ``category`` is
+            "otras_clases" and ``confidence`` is 0.0; otherwise ``confidence``
+            is 0.9.
+        """
+        categories = ["historia", "analisis_contable", "instituciones_gobierno", "otras_clases"]
+
+        prompt = f"""Classify the following text into one of these categories:
+- historia
+- analisis_contable
+- instituciones_gobierno
+- otras_clases
+
+Text: {text}
+
+Return only the category name, nothing else."""
+        raw = self._run(prompt, use_flash=True)
+
+        # Tolerate whitespace, quotes, backticks, asterisks or a trailing
+        # period around the label before comparing it to the known names.
+        label = raw.strip("`'\"*. \t\r\n").lower()
+        recognized = label in categories
+
+        return {
+            "category": label if recognized else "otras_clases",
+            # A fallback label is a guess, so it does not get the 0.9 score
+            "confidence": 0.9 if recognized else 0.0,
+            "provider": self.name
+        }
--- a/backup/originals_20250109/processed_registry.py
+++ b/backup/originals_20250109/processed_registry.py
@@ -0,0 +1,137 @@
+"""
+Processed files registry using repository pattern
+"""
+import fcntl
+import logging
+from pathlib import Path
+from typing import Set, Optional
+from datetime import datetime, timedelta
+from ..config import settings
+
+
+class ProcessedRegistry:
+    """Registry of processed files stored one entry per line in a text file.
+
+    The file lives at ``settings.processed_files_path``. An in-memory set
+    mirrors it and holds every stored line plus that line's basename, so a
+    lookup matches on either. Writers hold an exclusive ``fcntl.flock`` on
+    the registry file while they modify it.
+    """
+
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+        self._cache: Set[str] = set()
+        self._cache_time: Optional[datetime] = None
+        # Seconds a loaded snapshot is served by load() without re-reading
+        self._cache_ttl = 60
+        self._initialized = False
+
+    def initialize(self) -> None:
+        """Load the registry from disk and mark the instance as initialized"""
+        self.load()
+        self._initialized = True
+
+    def load(self) -> Set[str]:
+        """Return the set of processed entries, reading the file if needed.
+
+        A snapshot younger than the cache TTL is returned without touching
+        the disk. Blank lines and lines starting with '#' are skipped. Read
+        errors are logged and whatever was read so far is returned.
+
+        Returns:
+            A copy of the cached set (stored lines and their basenames).
+        """
+        now = datetime.utcnow()
+        # Test the timestamp, not the set's truthiness, so that an empty
+        # registry is cached too instead of being re-read on every call.
+        if self._cache_time is not None and (now - self._cache_time).total_seconds() < self._cache_ttl:
+            return self._cache.copy()
+
+        processed = set()
+        registry_path = settings.processed_files_path
+
+        try:
+            registry_path.parent.mkdir(parents=True, exist_ok=True)
+            if registry_path.exists():
+                with open(registry_path, 'r', encoding='utf-8') as f:
+                    for raw_line in f:
+                        line = raw_line.strip()
+                        if line and not line.startswith('#'):
+                            processed.add(line)
+                            processed.add(Path(line).name)
+        except Exception as e:
+            self.logger.error(f"Error reading processed files registry: {e}")
+
+        self._cache = processed
+        self._cache_time = now
+        return processed.copy()
+
+    def save(self, file_path: str) -> None:
+        """Append ``file_path`` to the registry unless it is already known.
+
+        An empty ``file_path`` is ignored.
+
+        Raises:
+            Exception: Any error raised while writing is logged and re-raised.
+        """
+        if not file_path:
+            return
+
+        # The duplicate check below runs against the cache, so the cache must
+        # reflect the file first; otherwise a fresh process appends entries
+        # that are already on disk.
+        if not self._initialized:
+            self.initialize()
+
+        registry_path = settings.processed_files_path
+
+        try:
+            registry_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(registry_path, 'a', encoding='utf-8') as f:
+                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                try:
+                    if file_path not in self._cache:
+                        f.write(file_path + "\n")
+                        # Flush while the lock is held; otherwise the buffered
+                        # line only reaches the file after the unlock below.
+                        f.flush()
+                        self._cache.add(file_path)
+                        # Mirror load(), which also caches the basename
+                        self._cache.add(Path(file_path).name)
+                        self.logger.debug(f"Added {file_path} to processed registry")
+                finally:
+                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+        except Exception as e:
+            self.logger.error(f"Error saving to processed files registry: {e}")
+            raise
+
+    def is_processed(self, file_path: str) -> bool:
+        """Return True if ``file_path`` or its basename is in the registry"""
+        if not self._initialized:
+            self.initialize()
+        return file_path in self._cache or Path(file_path).name in self._cache
+
+    def remove(self, file_path: str) -> bool:
+        """Remove every registry line whose basename matches ``file_path``.
+
+        Returns:
+            False when the registry file does not exist or an error occurs
+            (the error is logged); True otherwise, including when no line
+            matched.
+        """
+        registry_path = settings.processed_files_path
+        target_name = Path(file_path).name
+
+        try:
+            if not registry_path.exists():
+                return False
+
+            # 'r+' leaves the content intact until the lock is held; opening
+            # with 'w' would truncate the file before the lock is acquired.
+            with open(registry_path, 'r+', encoding='utf-8') as f:
+                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                try:
+                    lines_to_keep = []
+                    removed_entries = []
+                    for line in f.readlines():
+                        entry = line.strip()
+                        # An entry equal to file_path has the same basename,
+                        # so comparing basenames covers both kinds of match.
+                        if Path(entry).name == target_name:
+                            removed_entries.append(entry)
+                        else:
+                            lines_to_keep.append(line)
+
+                    if removed_entries:
+                        f.seek(0)
+                        f.writelines(lines_to_keep)
+                        f.truncate()
+                        # Flush before the lock is released
+                        f.flush()
+
+                    self._cache.discard(file_path)
+                    self._cache.discard(target_name)
+                    # Drop other full paths that were removed from the file
+                    self._cache.difference_update(removed_entries)
+                finally:
+                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+            return True
+        except Exception as e:
+            self.logger.error(f"Error removing from processed files registry: {e}")
+            return False
+
+    def clear(self) -> None:
+        """Delete the registry file and empty the cache.
+
+        Raises:
+            Exception: Any error raised while deleting is logged and re-raised.
+        """
+        registry_path = settings.processed_files_path
+        try:
+            if registry_path.exists():
+                registry_path.unlink()
+            self._cache.clear()
+            self._cache_time = None
+            self.logger.info("Processed files registry cleared")
+        except Exception as e:
+            self.logger.error(f"Error clearing processed files registry: {e}")
+            raise
+
+    def get_all(self) -> Set[str]:
+        """Return a copy of the cached entries, loading them on first use"""
+        if not self._initialized:
+            self.initialize()
+        return self._cache.copy()
+
+    def count(self) -> int:
+        """Return the number of cached entries (stored lines and basenames)"""
+        if not self._initialized:
+            self.initialize()
+        return len(self._cache)
+
+
+# Module-level ProcessedRegistry instance, created when this module is imported
+processed_registry = ProcessedRegistry()