""" Document generation utilities """ import logging import subprocess import re from pathlib import Path from typing import Dict, Any, List, Tuple from core import FileProcessingError from config import settings from services.ai import ai_provider_factory class DocumentGenerator: """Generate documents from processed text""" def __init__(self): self.logger = logging.getLogger(__name__) self.ai_provider = ai_provider_factory.get_best_provider() def generate_summary( self, text: str, base_name: str ) -> Tuple[bool, str, Dict[str, Any]]: """Generate unified summary""" self.logger.info(f"Generating summary for {base_name}") try: # Step 1: Generate Bullet Points (Chunking handled by provider or single prompt for now) # Note: We use the main provider (Claude/Zai) for content generation self.logger.info("Generating bullet points...") bullet_prompt = f"""Analiza el siguiente texto y extrae entre 5 y 8 bullet points clave en español. REGLAS ESTRICTAS: 1. Devuelve ÚNICAMENTE bullet points, cada línea iniciando con "- " 2. Cada bullet debe ser conciso (12-20 palabras) y resaltar datos, fechas, conceptos o conclusiones importantes 3. NO agregues introducciones, conclusiones ni texto explicativo 4. Concéntrate en los puntos más importantes del texto 5. Incluye fechas, datos específicos y nombres relevantes si los hay Texto: {text[:15000]}""" # Truncate to avoid context limits if necessary, though providers handle it differently try: bullet_points = self.ai_provider.generate_text(bullet_prompt) self.logger.info(f"Bullet points generated: {len(bullet_points)}") except Exception as e: self.logger.warning(f"Bullet point generation failed: {e}") bullet_points = "- Puntos clave no disponibles por error en IA" # Step 2: Generate Unified Summary self.logger.info("Generating unified summary...") summary_prompt = f"""Eres un profesor universitario experto en historia y economía. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos. REQUISITOS ESTRICTOS DE CONTENIDO: - Extensión entre 500-700 palabras - Usa encabezados Markdown con jerarquía clara (##, ###) - Desarrolla los puntos clave con profundidad y contexto histórico/económico - Mantén un tono académico y analítico - Incluye conclusiones significativas - NO agregues texto fuera del resumen - Devuelve únicamente el resumen en formato Markdown REQUISITOS ESTRICTOS DE FORMATO MATEMÁTICO (LaTeX): - Si el texto incluye fórmulas matemáticas o económicas, DEBES usar formato LaTeX. - Usa bloques $$ ... $$ para ecuaciones centradas importantes. - Usa $ ... $ para ecuaciones en línea. - Ejemplo: La fórmula del interés compuesto es $A = P(1 + r/n)^{{nt}}$. - NO uses bloques de código (```latex) para las fórmulas, úsalas directamente en el texto para que Pandoc las renderice. Contenido a resumir: {text[:20000]} Puntos clave a incluir obligatoriamente: {bullet_points}""" try: raw_summary = self.ai_provider.generate_text(summary_prompt) except Exception as e: self.logger.error(f"Raw summary generation failed: {e}") raise e # Step 3: Format with IA (using main provider instead of Gemini) self.logger.info("Formatting summary with IA...") format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible y compatible con Pandoc: {raw_summary} Instrucciones: - Corrige cualquier error de formato Markdown - Asegúrate de que los encabezados estén bien espaciados - Verifica que las viñetas usen "- " correctamente - Mantén exactamente el contenido existente - EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave - VERIFICA que todas las fórmulas matemáticas estén correctamente encerradas en $...$ (inline) o $$...$$ (display) - NO alteres la sintaxis LaTeX dentro de los delimitadores $...$ o $$...$$ - Devuelve únicamente el resumen formateado sin texto adicional""" try: # Use the main provider (Claude/GLM) for formatting too if self.ai_provider.is_available(): summary = self.ai_provider.generate_text(format_prompt) else: self.logger.warning( "AI provider not available for formatting, using raw summary" ) summary = raw_summary except Exception as e: self.logger.warning(f"Formatting failed ({e}), using raw summary") summary = raw_summary # Generate filename filename = self._generate_filename(text, summary) # Create document markdown_path = self._create_markdown(summary, base_name) docx_path = None try: docx_path = self._create_docx(markdown_path, base_name) except Exception as e: self.logger.error(f"Failed to create DOCX (non-critical): {e}") pdf_path = None try: # Sanitize LaTeX before PDF generation self._sanitize_latex(markdown_path) pdf_path = self._create_pdf(markdown_path, base_name) except Exception as e: self.logger.error(f"Failed to create PDF (non-critical): {e}") # Upload to Notion if configured from services.notion_service import notion_service notion_uploaded = False notion_page_id = None if settings.has_notion_config: try: title = base_name.replace("_", " ").title() # Crear página con el contenido completo del resumen notion_metadata = { "file_type": "Audio", # O 'PDF' dependiendo del origen "pdf_path": pdf_path if pdf_path else Path(""), "add_status": False, # No usar Status/Tipo (no existen en la DB) "use_as_page": False, # Usar como database, no página } notion_page_id = notion_service.create_page_with_summary( title=title, summary=summary, metadata=notion_metadata ) if notion_page_id: notion_uploaded = True self.logger.info( f"✅ Resumen subido a Notion: {title} (ID: {notion_page_id})" ) else: self.logger.warning(f"⚠️ No se pudo subir a Notion: {title}") except Exception as e: self.logger.warning(f"❌ Error al subir a Notion: {e}") import traceback traceback.print_exc() else: self.logger.info("Notion not configured - skipping upload") metadata = { "markdown_path": str(markdown_path), "docx_path": str(docx_path) if docx_path else "", "pdf_path": str(pdf_path) if pdf_path else "", "docx_name": Path(docx_path).name if docx_path else "", "summary": summary, "filename": filename, "notion_uploaded": notion_uploaded, "notion_page_id": notion_page_id, } return True, summary, metadata except Exception as e: self.logger.error(f"Document generation process failed: {e}") return False, "", {} def _sanitize_latex(self, markdown_path: Path) -> None: """Sanitize LaTeX syntax in Markdown file to prevent Pandoc errors""" try: content = markdown_path.read_text(encoding="utf-8") # 1. Unescape escaped dollar signs which are common LLM errors for math content = content.replace(r"\$", "$") # 2. Fix common Cyrillic and Greek characters that sneak in via LLMs replacements = { "ч": "ch", "в": "v", "к": "k", "м": "m", "н": "n", "т": "t", "—": "-", "–": "-", "“": '"', "”": '"', "’": "'", "Δ": "$\\Delta$", "δ": "$\\delta$", "Σ": "$\\Sigma$", "σ": "$\\sigma$", "π": "$\\pi$", "Π": "$\\Pi$", "α": "$\\alpha$", "β": "$\\beta$", "γ": "$\\gamma$", "θ": "$\\theta$", "λ": "$\\lambda$", "μ": "$\\mu$", } # Be careful not to double-replace already correct LaTeX for char, repl in replacements.items(): if char in content: # Check if it's already inside math mode would be complex, # but for now we assume raw unicode greek chars should become latex content = content.replace(char, repl) markdown_path.write_text(content, encoding="utf-8") self.logger.info(f"Sanitized LaTeX in {markdown_path}") except Exception as e: self.logger.warning(f"Failed to sanitize LaTeX: {e}") def _generate_filename(self, text: str, summary: str) -> str: """Generate intelligent filename""" try: # Use AI to extract key topics prompt = f"""Extract 2-3 key topics from this summary to create a filename. Summary: {summary} Return only the topics separated by hyphens, max 20 chars each, in Spanish:""" try: topics_text = self.ai_provider.generate_text(prompt) except Exception: topics_text = summary[:100] # Simple topic extraction topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3] if not topics: topics = ["documento"] # Limit topic length topics = [t[: settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics] filename = "_".join(topics)[: settings.MAX_FILENAME_LENGTH] return filename except Exception as e: self.logger.error(f"Filename generation failed: {e}") return "documento" def _create_markdown(self, summary: str, base_name: str) -> Path: """Create Markdown document""" output_dir = settings.LOCAL_DOWNLOADS_PATH output_dir.mkdir(parents=True, exist_ok=True) output_path = output_dir / f"{base_name}_unificado.md" content = f"""# {base_name.replace("_", " ").title()} ## Resumen {summary} --- *Generado por CBCFacil* """ with open(output_path, "w", encoding="utf-8") as f: f.write(content) return output_path def _create_docx(self, markdown_path: Path, base_name: str) -> Path: """Create DOCX document using pandoc""" output_dir = settings.LOCAL_DOCX output_dir.mkdir(parents=True, exist_ok=True) output_path = output_dir / f"{base_name}_unificado.docx" self.logger.info( f"Converting Markdown to DOCX: {markdown_path} -> {output_path}" ) try: cmd = [ "pandoc", str(markdown_path), "-o", str(output_path), "--from=markdown", "--to=docx", ] result = subprocess.run(cmd, capture_output=True, text=True, check=True) self.logger.info("DOCX generated successfully with pandoc") return output_path except subprocess.CalledProcessError as e: self.logger.error(f"Pandoc DOCX conversion failed: {e.stderr}") raise FileProcessingError(f"Failed to generate DOCX: {e.stderr}") except Exception as e: self.logger.error(f"Error generating DOCX: {e}") raise FileProcessingError(f"Error generating DOCX: {e}") def _create_pdf(self, markdown_path: Path, base_name: str) -> Path: """Create PDF document using pandoc and pdflatex""" output_dir = settings.LOCAL_DOWNLOADS_PATH output_dir.mkdir(parents=True, exist_ok=True) output_path = output_dir / f"{base_name}_unificado.pdf" self.logger.info( f"Converting Markdown to PDF: {markdown_path} -> {output_path}" ) try: cmd = [ "pandoc", str(markdown_path), "-o", str(output_path), "--pdf-engine=pdflatex", "-V", "geometry:margin=2.5cm", "-V", "fontsize=12pt", "--highlight-style=tango", ] result = subprocess.run(cmd, capture_output=True, text=True, check=True) self.logger.info("PDF generated successfully with pandoc") return output_path except subprocess.CalledProcessError as e: self.logger.error(f"Pandoc PDF conversion failed: {e.stderr}") raise FileProcessingError(f"Failed to generate PDF: {e.stderr}") except Exception as e: self.logger.error(f"Error generating PDF: {e}") raise FileProcessingError(f"Error generating PDF: {e}")