Restore full pipeline: 3-step summarization, formatting, PDF/DOCX generation

This commit is contained in:
2026-01-09 17:01:22 -03:00
parent b017504c52
commit e6a01d08d4
20 changed files with 260 additions and 43 deletions

6
.gitignore vendored
View File

@@ -65,3 +65,9 @@ cbc-main.pid
ehthumbs.db
Thumbs.db
.aider*
# Temporary files from restoration
old/
imperio/
check_models.py
compare_configs.py

View File

@@ -8,10 +8,10 @@ from typing import Dict, Any, List
from flask import Flask, render_template, request, jsonify, send_from_directory
from flask_cors import CORS
from ..config import settings
from ..storage import processed_registry
from ..services.webdav_service import webdav_service
from ..services import vram_manager
from config import settings
from storage import processed_registry
from services.webdav_service import webdav_service
from services import vram_manager
def create_app() -> Flask:

View File

@@ -61,8 +61,8 @@ class Settings:
# Gemini
GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
GEMINI_FLASH_MODEL: Optional[str] = os.getenv("GEMINI_FLASH_MODEL")
GEMINI_PRO_MODEL: Optional[str] = os.getenv("GEMINI_PRO_MODEL")
GEMINI_FLASH_MODEL: str = os.getenv("GEMINI_FLASH_MODEL", "gemini-2.5-flash")
GEMINI_PRO_MODEL: str = os.getenv("GEMINI_PRO_MODEL", "gemini-1.5-pro")
# CLI paths
GEMINI_CLI_PATH: Optional[str] = os.getenv("GEMINI_CLI_PATH")

View File

@@ -10,6 +10,7 @@ from .exceptions import (
FileProcessingError
)
from .result import Result
from .base_service import BaseService
__all__ = [
'ProcessingError',
@@ -17,5 +18,6 @@ __all__ = [
'AIProcessingError',
'ConfigurationError',
'FileProcessingError',
'Result'
'Result',
'BaseService'
]

View File

@@ -5,9 +5,9 @@ import logging
import re
from pathlib import Path
from typing import Dict, Any, List, Tuple
from ..core import FileProcessingError
from ..config import settings
from ..services.ai import ai_provider_factory
from core import FileProcessingError
from config import settings
from services.ai import ai_provider_factory
class DocumentGenerator:
@@ -22,8 +22,79 @@ class DocumentGenerator:
self.logger.info(f"Generating summary for {base_name}")
try:
# Generate summary
summary = self.ai_provider.summarize(text)
# Step 1: Generate Bullet Points (Chunking handled by provider or single prompt for now)
# Note: We use the main provider (Claude/Zai) for content generation
self.logger.info("Generating bullet points...")
bullet_prompt = f"""Analiza el siguiente texto y extrae entre 5 y 8 bullet points clave en español.
REGLAS ESTRICTAS:
1. Devuelve ÚNICAMENTE bullet points, cada línea iniciando con "- "
2. Cada bullet debe ser conciso (12-20 palabras) y resaltar datos, fechas, conceptos o conclusiones importantes
3. NO agregues introducciones, conclusiones ni texto explicativo
4. Concéntrate en los puntos más importantes del texto
5. Incluye fechas, datos específicos y nombres relevantes si los hay
Texto:
{text[:15000]}""" # Truncate to avoid context limits if necessary, though providers handle it differently
try:
bullet_points = self.ai_provider.generate_text(bullet_prompt)
self.logger.info(f"Bullet points generated: {len(bullet_points)}")
except Exception as e:
self.logger.warning(f"Bullet point generation failed: {e}")
bullet_points = "- Puntos clave no disponibles por error en IA"
# Step 2: Generate Unified Summary
self.logger.info("Generating unified summary...")
summary_prompt = f"""Eres un profesor universitario experto en historia del siglo XX. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos.
REQUISITOS ESTRICTOS:
- Extensión entre 500-700 palabras
- Usa encabezados Markdown con jerarquía clara (##, ###)
- Desarrolla los puntos clave con profundidad y contexto histórico
- Mantén un tono académico y analítico
- Incluye conclusiones significativas
- NO agregues texto fuera del resumen
- Devuelve únicamente el resumen en formato Markdown
Contenido a resumir:
{text[:20000]}
Puntos clave a incluir obligatoriamente:
{bullet_points}"""
try:
raw_summary = self.ai_provider.generate_text(summary_prompt)
except Exception as e:
self.logger.error(f"Raw summary generation failed: {e}")
raise e
# Step 3: Format with Gemini (using GeminiProvider explicitly)
self.logger.info("Formatting summary with Gemini...")
format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible:
{raw_summary}
Instrucciones:
- Corrige cualquier error de formato
- Asegúrate de que los encabezados estén bien espaciados
- Verifica que las viñetas usen "- " correctamente
- Mantén exactamente el contenido existente
- Devuelve únicamente el resumen formateado sin texto adicional"""
# Use generic Gemini provider for formatting as requested
from services.ai.gemini_provider import GeminiProvider
formatter = GeminiProvider()
try:
if formatter.is_available():
summary = formatter.generate_text(format_prompt)
else:
self.logger.warning("Gemini formatter not available, using raw summary")
summary = raw_summary
except Exception as e:
self.logger.warning(f"Formatting failed ({e}), using raw summary")
summary = raw_summary
# Generate filename
filename = self._generate_filename(text, summary)
@@ -45,7 +116,7 @@ class DocumentGenerator:
return True, summary, metadata
except Exception as e:
self.logger.error(f"Summary generation failed: {e}")
self.logger.error(f"Document generation process failed: {e}")
return False, "", {}
def _generate_filename(self, text: str, summary: str) -> str:

90
main.py
View File

@@ -276,7 +276,95 @@ def run_main_loop() -> None:
for file_path in audio_files:
if any(file_path.lower().endswith(ext) for ext in settings.AUDIO_EXTENSIONS):
if not processed_registry.is_processed(file_path):
audio_processor.process(file_path)
from pathlib import Path
from urllib.parse import unquote
from document.generators import DocumentGenerator
from services.telegram_service import telegram_service
local_filename = unquote(Path(file_path).name)
base_name = Path(local_filename).stem
local_path = settings.LOCAL_DOWNLOADS_PATH / local_filename
settings.LOCAL_DOWNLOADS_PATH.mkdir(parents=True, exist_ok=True)
# Step 1: Notify and download
telegram_service.send_message(
f"🎵 Nuevo audio detectado: {local_filename}\n"
f"⬇️ Descargando..."
)
logger.info(f"Downloading audio: {file_path} -> {local_path}")
webdav_service.download(file_path, local_path)
# Step 2: Transcribe
telegram_service.send_message(f"📝 Transcribiendo audio con Whisper...")
result = audio_processor.process(str(local_path))
if result.get("success") and result.get("transcription_path"):
transcription_file = Path(result["transcription_path"])
transcription_text = result.get("text", "")
# Step 3: Generate AI summary and documents
telegram_service.send_message(f"🤖 Generando resumen con IA...")
doc_generator = DocumentGenerator()
success, summary, output_files = doc_generator.generate_summary(
transcription_text, base_name
)
# Step 4: Upload all files to Nextcloud
if success and output_files:
# Create folders
for folder in [settings.RESUMENES_FOLDER, settings.DOCX_FOLDER]:
try:
webdav_service.makedirs(folder)
except Exception:
pass
# Upload transcription TXT
if transcription_file.exists():
remote_txt = f"{settings.RESUMENES_FOLDER}/{transcription_file.name}"
webdav_service.upload(transcription_file, remote_txt)
logger.info(f"Uploaded: {remote_txt}")
# Upload DOCX
docx_path = Path(output_files.get('docx_path', ''))
if docx_path.exists():
remote_docx = f"{settings.DOCX_FOLDER}/{docx_path.name}"
webdav_service.upload(docx_path, remote_docx)
logger.info(f"Uploaded: {remote_docx}")
# Upload PDF
pdf_path = Path(output_files.get('pdf_path', ''))
if pdf_path.exists():
remote_pdf = f"{settings.DOCX_FOLDER}/{pdf_path.name}"
webdav_service.upload(pdf_path, remote_pdf)
logger.info(f"Uploaded: {remote_pdf}")
# Upload Markdown
md_path = Path(output_files.get('markdown_path', ''))
if md_path.exists():
remote_md = f"{settings.RESUMENES_FOLDER}/{md_path.name}"
webdav_service.upload(md_path, remote_md)
logger.info(f"Uploaded: {remote_md}")
# Final notification
telegram_service.send_message(
f"✅ Audio procesado: {local_filename}\n"
f"📄 DOCX: {docx_path.name if docx_path.exists() else 'N/A'}\n"
f"📑 PDF: {pdf_path.name if pdf_path.exists() else 'N/A'}\n"
f"☁️ Subido a Nextcloud"
)
else:
# Just upload transcription if summary failed
if transcription_file.exists():
try:
webdav_service.makedirs(settings.RESUMENES_FOLDER)
except Exception:
pass
remote_txt = f"{settings.RESUMENES_FOLDER}/{transcription_file.name}"
webdav_service.upload(transcription_file, remote_txt)
telegram_service.send_message(
f"⚠️ Resumen fallido, solo transcripción subida:\n{transcription_file.name}"
)
processed_registry.save(file_path)
except Exception as e:
logger.exception(f"Error processing audio: {e}")

View File

@@ -4,10 +4,10 @@ Audio file processor using Whisper
import logging
from pathlib import Path
from typing import Dict, Any
from ..core import FileProcessingError
from ..config import settings
from ..services import vram_manager
from ..services.gpu_detector import gpu_detector
from core import FileProcessingError
from config import settings
from services import vram_manager
from services.gpu_detector import gpu_detector
from .base_processor import FileProcessor
try:

View File

@@ -4,7 +4,7 @@ Base File Processor (Strategy Pattern)
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Any, Optional
from ..core import FileProcessingError
from core import FileProcessingError
class FileProcessor(ABC):

View File

@@ -5,10 +5,10 @@ import logging
from pathlib import Path
from typing import Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from ..core import FileProcessingError
from ..config import settings
from ..services import vram_manager
from ..services.gpu_detector import gpu_detector
from core import FileProcessingError
from config import settings
from services import vram_manager
from services.gpu_detector import gpu_detector
from .base_processor import FileProcessor
try:
@@ -22,6 +22,11 @@ try:
PDF_OCR_AVAILABLE = True
except ImportError:
PDF_OCR_AVAILABLE = False
# Provide stub for type hints
try:
from PIL import Image
except ImportError:
Image = None # type: ignore
class PDFProcessor(FileProcessor):

View File

@@ -4,8 +4,8 @@ Text file processor
import logging
from pathlib import Path
from typing import Dict, Any
from ..core import FileProcessingError
from ..config import settings
from core import FileProcessingError
from config import settings
from .base_processor import FileProcessor

View File

@@ -5,11 +5,16 @@ AI Providers package for CBCFacil
from .base_provider import AIProvider
from .claude_provider import ClaudeProvider
from .gemini_provider import GeminiProvider
from .provider_factory import AIProviderFactory
from .provider_factory import AIProviderFactory, ai_provider_factory
# Alias for backwards compatibility
ai_service = ai_provider_factory
__all__ = [
'AIProvider',
'ClaudeProvider',
'GeminiProvider',
'AIProviderFactory'
'AIProviderFactory',
'ai_provider_factory',
'ai_service'
]

View File

@@ -23,6 +23,11 @@ class AIProvider(ABC):
"""Classify content into categories"""
pass
@abstractmethod
def generate_text(self, prompt: str, **kwargs) -> str:
    """Generate free-form text for *prompt*.

    Concrete providers implement this against their own backend.
    **kwargs carries provider-specific options (e.g. the Gemini
    implementation reads ``use_flash`` to pick a model); implementations
    may ignore options they do not support.

    Returns:
        The generated text as a plain string.
    """
    pass
@abstractmethod
def is_available(self) -> bool:
"""Check if provider is available and configured"""

View File

@@ -6,8 +6,8 @@ import subprocess
import shutil
from typing import Dict, Any, Optional
from ..config import settings
from ..core import AIProcessingError
from config import settings
from core import AIProcessingError
from .base_provider import AIProvider
@@ -106,3 +106,7 @@ Return only the category name, nothing else."""
"confidence": 0.9,
"provider": self.name
}
def generate_text(self, prompt: str, **kwargs) -> str:
    """Produce a completion for *prompt* via the Claude CLI.

    ``**kwargs`` is accepted for interface compatibility with other
    providers; the CLI wrapper takes no extra options, so they are
    ignored here.
    """
    response = self._run_cli(prompt)
    return response

View File

@@ -9,8 +9,8 @@ import time
from typing import Dict, Any, Optional
from datetime import datetime, timedelta
from ..config import settings
from ..core import AIProcessingError
from config import settings
from core import AIProcessingError
from .base_provider import AIProvider
@@ -90,6 +90,7 @@ class GeminiProvider(AIProvider):
def __init__(self):
super().__init__()
self.logger = logging.getLogger(__name__)
self._cli_path = settings.GEMINI_CLI_PATH or shutil.which("gemini")
self._api_key = settings.GEMINI_API_KEY
self._flash_model = settings.GEMINI_FLASH_MODEL
@@ -104,6 +105,14 @@ class GeminiProvider(AIProvider):
"exponential_base": 2
}
@property
def name(self) -> str:
    """Human-readable provider identifier (reported in result metadata)."""
    return "Gemini"
def is_available(self) -> bool:
    """Return True when a Gemini backend can be used.

    Available means either a CLI binary path or an API key was found
    at construction time.
    """
    configured = self._cli_path or self._api_key
    return bool(configured)
def _init_session(self) -> None:
"""Initialize HTTP session with connection pooling"""
if self._session is None:
@@ -275,6 +284,13 @@ Return only the category name, nothing else."""
"confidence": 0.9,
"provider": self.name
}
def generate_text(self, prompt: str, **kwargs) -> str:
    """Produce a text completion for *prompt* using Gemini.

    Prefers the HTTP API whenever an API key is configured, otherwise
    falls back to the CLI binary. Pass ``use_flash=False`` to target
    the Pro model instead of Flash (API path only).
    """
    prefer_flash = kwargs.get('use_flash', True)
    if not self._api_key:
        # No API key configured — route through the local CLI instead.
        return self._call_cli(prompt, use_yolo=True)
    return self._call_api(prompt, use_flash=prefer_flash)
def get_stats(self) -> Dict[str, Any]:
"""Get provider statistics"""

View File

@@ -4,7 +4,7 @@ AI Provider Factory (Factory Pattern)
import logging
from typing import Dict, Type
from ..core import AIProcessingError
from core import AIProcessingError
from .base_provider import AIProvider
from .claude_provider import ClaudeProvider
from .gemini_provider import GeminiProvider

View File

@@ -7,8 +7,8 @@ import time
from typing import Optional, Dict, Any
from threading import Lock
from ..config import settings
from ..core import AIProcessingError
from config import settings
from core import AIProcessingError
from .ai.provider_factory import AIProviderFactory, ai_provider_factory

View File

@@ -5,7 +5,7 @@ import logging
import time
from typing import Optional
from datetime import datetime
from ..config import settings
from config import settings
try:
import requests

View File

@@ -7,8 +7,8 @@ import os
import time
from datetime import datetime, timedelta
from typing import Optional, Dict, Any
from ..core import BaseService
from ..config import settings
from core import BaseService
from config import settings
try:
import torch

View File

@@ -13,8 +13,8 @@ import requests
from requests.auth import HTTPBasicAuth
from requests.adapters import HTTPAdapter
from ..config import settings
from ..core import WebDAVError
from config import settings
from core import WebDAVError
class WebDAVService:
@@ -112,16 +112,31 @@ class WebDAVService:
files = []
try:
import xml.etree.ElementTree as ET
from urllib.parse import urlparse, unquote
root = ET.fromstring(xml_response)
# Get the WebDAV path from settings
parsed_url = urlparse(settings.NEXTCLOUD_URL)
webdav_path = parsed_url.path.rstrip('/') # e.g. /remote.php/webdav
# Find all href elements
for href in root.findall('.//{DAV:}href'):
href_text = href.text or ""
href_text = unquote(href_text) # Decode URL encoding
# Remove base URL from href
base_url = settings.NEXTCLOUD_URL.rstrip('/')
if href_text.startswith(base_url):
href_text = href_text[len(base_url):]
files.append(href_text.lstrip('/'))
# Also strip the webdav path if it's there
if href_text.startswith(webdav_path):
href_text = href_text[len(webdav_path):]
# Clean up the path
href_text = href_text.lstrip('/')
if href_text: # Skip empty paths (root directory)
files.append(href_text)
except Exception as e:
self.logger.error(f"Error parsing PROPFIND response: {e}")
@@ -178,8 +193,8 @@ class WebDAVService:
self._request('MKCOL', current)
self.logger.debug(f"Created directory: {current}")
except WebDAVError as e:
# Directory might already exist (409 Conflict is OK)
if '409' not in str(e):
# Directory might already exist (409 Conflict or 405 MethodNotAllowed is OK)
if '409' not in str(e) and '405' not in str(e):
raise
def delete(self, remote_path: str) -> None:

View File

@@ -7,7 +7,7 @@ import time
from pathlib import Path
from typing import Set, Optional
from datetime import datetime, timedelta
from ..config import settings
from config import settings
class BloomFilter: