feat: Sistema LaTeX mejorado con sanitización automática y corrección de TikZ

Cambios principales:

## Nuevos archivos
- services/ai/parallel_provider.py: Ejecución paralela de múltiples proveedores AI
- services/ai/prompt_manager.py: Gestión centralizada de prompts (resumen.md como fuente)
- latex/resumen.md: Template del prompt para resúmenes académicos LaTeX

## Mejoras en generación LaTeX (document/generators.py)
- Nueva función _sanitize_latex(): Corrige automáticamente errores comunes de AI
- Agrega align=center a nodos TikZ con saltos de línea (\\)
- Previene errores 'Not allowed in LR mode' antes de compilar
- Soporte para procesamiento paralelo de proveedores AI
- Conversión DOCX en paralelo con generación PDF
- Uploads a Notion en background (non-blocking)
- Callbacks de notificación para progreso en Telegram

## Mejoras en proveedores AI
- claude_provider.py: fix_latex() con instrucciones específicas para errores TikZ
- gemini_provider.py: fix_latex() mejorado + rate limiting + circuit breaker
- provider_factory.py: Soporte para parallel provider

## Otros cambios
- config/settings.py: Nuevas configuraciones para Gemini models
- services/webdav_service.py: Mejoras en manejo de conexión
- .gitignore: Ignora archivos LaTeX auxiliares (.aux, .toc, .out, .pdf)

## Archivos de ejemplo
- latex/imperio_romano.tex, latex/clase_revolucion_rusa_crisis_30.tex
- resumen_curiosidades.tex (corregido y compilado exitosamente)
2026-02-07 20:50:27 +00:00
parent 915f827305
commit dcf887c510
15 changed files with 4309 additions and 409 deletions
--- a/services/ai/gemini_provider.py
+++ b/services/ai/gemini_provider.py
@@ -1,6 +1,7 @@
 """
 Gemini AI Provider - Optimized version with rate limiting and retry
 """
+
 import logging
 import subprocess
 import shutil
@@ -16,31 +17,32 @@ from .base_provider import AIProvider

 class TokenBucket:
    """Token bucket rate limiter"""
-    
+
    def __init__(self, rate: float = 10, capacity: int = 20):
        self.rate = rate  # tokens per second
        self.capacity = capacity
        self.tokens = capacity
        self.last_update = time.time()
        self._lock = None  # Lazy initialization
-    
+
    def _get_lock(self):
        if self._lock is None:
            import threading
+
            self._lock = threading.Lock()
        return self._lock
-    
+
    def acquire(self, tokens: int = 1) -> float:
        with self._get_lock():
            now = time.time()
            elapsed = now - self.last_update
            self.last_update = now
            self.tokens = min(self.capacity, self.tokens + elapsed * self.rate)
-            
+
            if self.tokens >= tokens:
                self.tokens -= tokens
                return 0.0
-            
+
            wait_time = (tokens - self.tokens) / self.rate
            self.tokens = 0
            return wait_time
@@ -48,7 +50,7 @@ class TokenBucket:

 class CircuitBreaker:
    """Circuit breaker for API calls"""
-    
+
    def __init__(self, failure_threshold: int = 5, recovery_timeout: int = 60):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
@@ -56,21 +58,26 @@ class CircuitBreaker:
        self.last_failure: Optional[datetime] = None
        self.state = "closed"  # closed, open, half-open
        self._lock = None
-    
+
    def _get_lock(self):
        if self._lock is None:
            import threading
+
            self._lock = threading.Lock()
        return self._lock
-    
+
    def call(self, func, *args, **kwargs):
        with self._get_lock():
            if self.state == "open":
-                if self.last_failure and (datetime.utcnow() - self.last_failure).total_seconds() > self.recovery_timeout:
+                if (
+                    self.last_failure
+                    and (datetime.utcnow() - self.last_failure).total_seconds()
+                    > self.recovery_timeout
+                ):
                    self.state = "half-open"
                else:
                    raise AIProcessingError("Circuit breaker is open")
-            
+
            try:
                result = func(*args, **kwargs)
                if self.state == "half-open":
@@ -87,7 +94,7 @@ class CircuitBreaker:

 class GeminiProvider(AIProvider):
    """Gemini AI provider with rate limiting and retry"""
-    
+
    def __init__(self):
        super().__init__()
        self.logger = logging.getLogger(__name__)
@@ -102,17 +109,17 @@ class GeminiProvider(AIProvider):
            "max_attempts": 3,
            "base_delay": 1.0,
            "max_delay": 30.0,
-            "exponential_base": 2
+            "exponential_base": 2,
        }
-    
+
    @property
    def name(self) -> str:
        return "Gemini"
-    
+
    def is_available(self) -> bool:
        """Check if Gemini CLI or API is available"""
        return bool(self._cli_path or self._api_key)
-    
+
    def _init_session(self) -> None:
        """Initialize HTTP session with connection pooling"""
        if self._session is None:
@@ -120,17 +127,17 @@ class GeminiProvider(AIProvider):
            adapter = requests.adapters.HTTPAdapter(
                pool_connections=10,
                pool_maxsize=20,
-                max_retries=0  # We handle retries manually
+                max_retries=0,  # We handle retries manually
            )
-            self._session.mount('https://', adapter)
-    
+            self._session.mount("https://", adapter)
+
    def _run_with_retry(self, func, *args, **kwargs):
        """Execute function with exponential backoff retry"""
        max_attempts = self._retry_config["max_attempts"]
        base_delay = self._retry_config["base_delay"]
-        
+
        last_exception = None
-        
+
        for attempt in range(max_attempts):
            try:
                return self._circuit_breaker.call(func, *args, **kwargs)
@@ -138,94 +145,84 @@ class GeminiProvider(AIProvider):
                last_exception = e
                if attempt < max_attempts - 1:
                    delay = min(
-                        base_delay * (2 ** attempt),
-                        self._retry_config["max_delay"]
+                        base_delay * (2**attempt), self._retry_config["max_delay"]
                    )
                    # Add jitter
                    delay += delay * 0.1 * (time.time() % 1)
-                    self.logger.warning(f"Attempt {attempt + 1} failed: {e}, retrying in {delay:.2f}s")
+                    self.logger.warning(
+                        f"Attempt {attempt + 1} failed: {e}, retrying in {delay:.2f}s"
+                    )
                    time.sleep(delay)
-        
+
        raise AIProcessingError(f"Max retries exceeded: {last_exception}")
-    
+
    def _run_cli(self, prompt: str, use_flash: bool = True, timeout: int = 300) -> str:
        """Run Gemini CLI with prompt"""
        if not self._cli_path:
            raise AIProcessingError("Gemini CLI not available")
-        
+
        model = self._flash_model if use_flash else self._pro_model
        cmd = [self._cli_path, model, prompt]
-        
+
        try:
            # Apply rate limiting
            wait_time = self._rate_limiter.acquire()
            if wait_time > 0:
                time.sleep(wait_time)
-            
+
            process = subprocess.run(
-                cmd,
-                text=True,
-                capture_output=True,
-                timeout=timeout,
-                shell=False
+                cmd, text=True, capture_output=True, timeout=timeout, shell=False
            )
-            
+
            if process.returncode != 0:
                error_msg = process.stderr or "Unknown error"
                raise AIProcessingError(f"Gemini CLI failed: {error_msg}")
-            
+
            return process.stdout.strip()
        except subprocess.TimeoutExpired:
            raise AIProcessingError(f"Gemini CLI timed out after {timeout}s")
        except Exception as e:
            raise AIProcessingError(f"Gemini CLI error: {e}")
-    
+
    def _call_api(self, prompt: str, use_flash: bool = True, timeout: int = 180) -> str:
        """Call Gemini API with rate limiting and retry"""
        if not self._api_key:
            raise AIProcessingError("Gemini API key not configured")
-        
+
        self._init_session()
-        
+
        model = self._flash_model if use_flash else self._pro_model
        url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
-        
-        payload = {
-            "contents": [{
-                "parts": [{"text": prompt}]
-            }]
-        }
-        
+
+        payload = {"contents": [{"parts": [{"text": prompt}]}]}
+
        params = {"key": self._api_key}
-        
+
        def api_call():
            # Apply rate limiting
            wait_time = self._rate_limiter.acquire()
            if wait_time > 0:
                time.sleep(wait_time)
-            
+
            response = self._session.post(
-                url,
-                json=payload,
-                params=params,
-                timeout=timeout
+                url, json=payload, params=params, timeout=timeout
            )
            response.raise_for_status()
            return response
-        
+
        response = self._run_with_retry(api_call)
        data = response.json()
-        
+
        if "candidates" not in data or not data["candidates"]:
            raise AIProcessingError("Empty response from Gemini API")
-        
+
        candidate = data["candidates"][0]
        if "content" not in candidate or "parts" not in candidate["content"]:
            raise AIProcessingError("Invalid response format from Gemini API")
-        
+
        result = candidate["content"]["parts"][0]["text"]
        return result.strip()
-    
+
    def _run(self, prompt: str, use_flash: bool = True, timeout: int = 300) -> str:
        """Run Gemini with fallback between CLI and API"""
        # Try CLI first if available
@@ -234,14 +231,14 @@ class GeminiProvider(AIProvider):
                return self._run_cli(prompt, use_flash, timeout)
            except Exception as e:
                self.logger.warning(f"Gemini CLI failed, trying API: {e}")
-        
+
        # Fallback to API
        if self._api_key:
            api_timeout = min(timeout, 180)
            return self._call_api(prompt, use_flash, api_timeout)
-        
+
        raise AIProcessingError("No Gemini provider available (CLI or API)")
-    
+
    def summarize(self, text: str, **kwargs) -> str:
        """Generate summary using Gemini"""
        prompt = f"""Summarize the following text:
@@ -250,7 +247,7 @@ class GeminiProvider(AIProvider):

 Provide a clear, concise summary in Spanish."""
        return self._run(prompt, use_flash=True)
-    
+
    def correct_text(self, text: str, **kwargs) -> str:
        """Correct text using Gemini"""
        prompt = f"""Correct the following text for grammar, spelling, and clarity:
@@ -259,11 +256,16 @@ Provide a clear, concise summary in Spanish."""

 Return only the corrected text, nothing else."""
        return self._run(prompt, use_flash=True)
-    
+
    def classify_content(self, text: str, **kwargs) -> Dict[str, Any]:
        """Classify content using Gemini"""
-        categories = ["historia", "analisis_contable", "instituciones_gobierno", "otras_clases"]
-        
+        categories = [
+            "historia",
+            "analisis_contable",
+            "instituciones_gobierno",
+            "otras_clases",
+        ]
+
        prompt = f"""Classify the following text into one of these categories:
 - historia
 - analisis_contable
@@ -274,39 +276,61 @@ Text: {text}

 Return only the category name, nothing else."""
        result = self._run(prompt, use_flash=True).lower()
-        
+
        # Validate result
        if result not in categories:
            result = "otras_clases"
-        
-        return {
-            "category": result,
-            "confidence": 0.9,
-            "provider": self.name
-        }
+
+        return {"category": result, "confidence": 0.9, "provider": self.name}

    def generate_text(self, prompt: str, **kwargs) -> str:
        """Generate text using Gemini"""
-        use_flash = kwargs.get('use_flash', True)
+        use_flash = kwargs.get("use_flash", True)
        if self._api_key:
            return self._call_api(prompt, use_flash=use_flash)
-        return self._call_cli(prompt, use_yolo=True)
-    
+        return self._run_cli(prompt, use_flash=use_flash)
+
+    def fix_latex(self, latex_code: str, error_log: str, **kwargs) -> str:
+        """Fix broken LaTeX code using Gemini"""
+        prompt = f"""Fix the following LaTeX code which failed to compile.
+
+Error Log:
+{error_log[-3000:]}
+
+Broken Code:
+{latex_code}
+
+INSTRUCTIONS:
+1. Return ONLY the corrected LaTeX code. No explanations.
+2. Start immediately with \\documentclass.
+
+COMMON LATEX ERRORS TO FIX:
+- TikZ nodes with line breaks (\\\\) MUST have "align=center" in their style.
+  WRONG: \\node[box] (n) {{Text\\\\More}};
+  CORRECT: \\node[box, align=center] (n) {{Text\\\\More}};
+- All \\begin{{env}} must have matching \\end{{env}}
+- All braces {{ }} must be balanced
+- Math mode $ must be paired
+- Special characters need escaping: % & # _
+- tcolorbox environments need proper titles: [Title] not {{Title}}
+"""
+        return self._run(prompt, use_flash=False)  # Use Pro model for coding fixes
+
    def get_stats(self) -> Dict[str, Any]:
        """Get provider statistics"""
        return {
            "rate_limiter": {
                "tokens": round(self._rate_limiter.tokens, 2),
                "capacity": self._rate_limiter.capacity,
-                "rate": self._rate_limiter.rate
+                "rate": self._rate_limiter.rate,
            },
            "circuit_breaker": {
                "state": self._circuit_breaker.state,
                "failures": self._circuit_breaker.failures,
-                "failure_threshold": self._circuit_breaker.failure_threshold
+                "failure_threshold": self._circuit_breaker.failure_threshold,
            },
            "cli_available": bool(self._cli_path),
-            "api_available": bool(self._api_key)
+            "api_available": bool(self._api_key),
        }