"""
Document generation utilities - LaTeX Academic Summary System

This module generates comprehensive academic summaries in LaTeX format
following the specifications in latex/resumen.md (the SINGLE SOURCE OF TRUTH).

Parallel Processing:
- AI Provider Racing: when the provider factory reports parallel support,
  several AI providers are raced and one response is used
- Format Conversion: the DOCX conversion runs on a worker thread once the
  PDF has been compiled
- Background Notion Uploads: non-blocking uploads to Notion
"""

import logging
import subprocess
import shutil
import re
import threading
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed

from core import FileProcessingError
from config import settings
from services.ai import ai_provider_factory
from services.ai.prompt_manager import prompt_manager


# Matches a TikZ node with an option list up to its terminating semicolon.
# Group 1: text inside [...]; group 2: everything after the options up to
# and including the first ';'. Compiled once because it is applied on every
# compilation attempt.
_TIKZ_NODE_PATTERN = re.compile(r"\\node\[([^\]]*)\]([^;]*;)")


def _sanitize_latex(latex_code: str) -> str:
    r"""
    Pre-process LaTeX code to fix common errors before compilation.

    Applies automated fixes for known issues that AI models frequently
    generate, reducing the need for fix_latex() iterations.

    Currently handles:
    - TikZ nodes whose text contains a line break (\\) but whose option
      list has no ``align=`` key: ``align=center`` is appended.

    Args:
        latex_code: LaTeX source to clean up. Falsy input (empty string or
            None) is returned unchanged.

    Returns:
        The LaTeX source with the fixes applied.
    """
    if not latex_code:
        return latex_code

    def fix_tikz_node(match):
        """Add align=center to one matched TikZ node when it is needed."""
        options = match.group(1)  # Content inside [...]
        rest = match.group(2)  # Everything after the options

        # Only nodes with a line break and no explicit alignment are changed.
        if "\\\\" not in rest or "align=" in options:
            return match.group(0)

        if options.strip():
            new_options = options.rstrip() + ", align=center"
        else:
            new_options = "align=center"
        return f"\\node[{new_options}]{rest}"

    return _TIKZ_NODE_PATTERN.sub(fix_tikz_node, latex_code)


class DocumentGenerator:
    """
    Generates academic summary documents in LaTeX format.

    The system follows these principles:
    1. latex/resumen.md is the SINGLE SOURCE OF TRUTH for prompt structure
    2. Generates full LaTeX documents (not Markdown)
    3. Compiles to PDF using pdflatex
    4. Supports iterative fixing with AI if compilation fails
    5. Supports progress notifications via callback
    """

    def __init__(self, notification_callback: Optional[Callable[[str], None]] = None):
        """
        Initialize DocumentGenerator.

        Args:
            notification_callback: Optional callback function for progress
                notifications. Takes a single string argument (message to send).
        """
        self.logger = logging.getLogger(__name__)
        self.ai_provider = ai_provider_factory.get_best_provider()
        self.notification_callback = notification_callback
        self.use_parallel = ai_provider_factory.use_parallel()
        self.executor = ThreadPoolExecutor(max_workers=4)

        # Ensure output directories exist
        settings.LOCAL_DOWNLOADS_PATH.mkdir(parents=True, exist_ok=True)
        settings.LOCAL_DOCX.mkdir(parents=True, exist_ok=True)

        if self.use_parallel:
            self.logger.info(
                "🚀 Parallel processing enabled: Multiple AI providers available"
            )

    def close(self) -> None:
        """Shut down the worker pool created in __init__ without waiting."""
        self.executor.shutdown(wait=False)

    def _notify(self, message: str) -> None:
        """Send notification if callback is configured.

        Errors raised by the callback are logged and never propagated, so a
        broken notifier cannot abort document generation.
        """
        if not self.notification_callback:
            return
        try:
            self.notification_callback(message)
        except Exception as e:
            self.logger.warning(f"Failed to send notification: {e}")

    def _generate_with_parallel_provider(self, prompt: str, **kwargs) -> str:
        """
        Generate content using multiple AI providers in parallel.

        Requests the parallel provider with the "race" strategy and returns
        the content of the selected response. If anything fails, falls back
        to the single provider chosen in __init__.

        Args:
            prompt: Prompt text to send.
            **kwargs: Extra arguments forwarded to the provider call.

        Returns:
            The generated text.
        """
        try:
            parallel_provider = ai_provider_factory.get_parallel_provider(max_workers=4)
            self.logger.info("🚀 Using parallel AI provider (race mode)")

            result = parallel_provider.generate_parallel(
                prompt=prompt,
                strategy="race",  # Use first successful response
                timeout_ms=300000,  # 5 minutes
                **kwargs,
            )

            self.logger.info(
                f"✅ Parallel generation complete: {result.selected_provider} selected, "
                f"{result.total_duration_ms}ms"
            )
            return result.content
        except Exception as e:
            self.logger.warning(
                f"⚠️ Parallel generation failed: {e}, falling back to single provider"
            )
            return self.ai_provider.generate_text(prompt, **kwargs)

    def _convert_formats_parallel(
        self, tex_path: Path, pdf_path: Optional[Path], base_name: str
    ) -> Optional[Path]:
        """
        Convert the .tex file to DOCX on the worker pool.

        Only DOCX is produced here; the conversion is skipped entirely when
        pandoc is not on PATH.

        Args:
            tex_path: Path to the .tex source.
            pdf_path: Already compiled PDF (currently unused; kept for
                interface compatibility).
            base_name: Base filename for the generated DOCX.

        Returns:
            Path to the DOCX, or None if pandoc is missing or conversion failed.
        """
        if not shutil.which("pandoc"):
            return None

        future = self.executor.submit(self._convert_tex_to_docx, tex_path, base_name)
        try:
            docx_path = future.result(timeout=60)
        except Exception as e:
            self.logger.warning(f"⚠️ DOCX generation failed: {e}")
            return None

        if docx_path:
            self.logger.info(f"✅ Parallel DOCX generated: {docx_path}")
        return docx_path

    def _upload_to_notion_background(
        self,
        base_name: str,
        summary: str,
        pdf_path: Optional[Path],
        metadata: Dict[str, Any],
    ) -> None:
        """Upload to Notion in background thread (non-blocking).

        The worker mutates ``metadata`` in place ("notion_uploaded" and
        "notion_page_id") when the upload succeeds, which may happen after
        the caller has already received the dict.
        """

        def upload_worker():
            try:
                # Imported lazily, inside the worker.
                from services.notion_service import notion_service

                title = base_name.replace("_", " ").title()
                notion_metadata = {
                    "file_type": "Audio",
                    "pdf_path": pdf_path or Path(""),
                    "add_status": False,
                    "use_as_page": False,
                }

                page_id = notion_service.create_page_with_summary(
                    title=title, summary=summary, metadata=notion_metadata
                )

                if page_id:
                    metadata["notion_uploaded"] = True
                    metadata["notion_page_id"] = page_id
                    self.logger.info(
                        f"✅ Background upload to Notion complete: {title}"
                    )
                else:
                    self.logger.warning(f"⚠️ Background Notion upload failed: {title}")
            except Exception as e:
                self.logger.warning(f"❌ Background Notion upload error: {e}")

        # Daemon thread: does not keep the interpreter alive on exit.
        thread = threading.Thread(target=upload_worker, daemon=True)
        thread.start()
        self.logger.info("🔄 Notion upload started in background")

    def _read_log_tail(self, base_name: str, max_chars: int = 2000) -> str:
        """Return the last ``max_chars`` characters of the pdflatex log.

        Returns the literal "Log file not found" when the log does not exist.
        """
        log_file = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.log"
        if not log_file.exists():
            return "Log file not found"
        return log_file.read_text(encoding="utf-8", errors="ignore")[-max_chars:]

    def _request_latex_fix(self, current_latex: str, error_log: str) -> Optional[str]:
        """Ask the AI provider to repair LaTeX that failed to compile.

        Args:
            current_latex: The LaTeX source that failed.
            error_log: Tail of the compilation log.

        Returns:
            The corrected LaTeX, or None when no usable fix was obtained
            (provider lacks fix_latex(), the request raised, or the provider
            returned nothing). None tells the caller to stop retrying.
        """
        try:
            self._notify("🔧 La IA está corrigiendo el código LaTeX...")

            if not hasattr(self.ai_provider, "fix_latex"):
                self.logger.error("❌ AI provider doesn't support fix_latex()")
                return None

            fixed_latex = self.ai_provider.fix_latex(current_latex, error_log)
            if not fixed_latex:
                # An empty fix would overwrite the last usable LaTeX and make
                # the next write/compile fail for an unrelated reason.
                self.logger.error("❌ AI fix request failed: empty response")
                return None

            cleaned = prompt_manager.extract_latex_from_response(fixed_latex)
            self._notify("✅ Código LaTeX corregido, reintentando compilación...")
            # Use the raw answer when no LaTeX could be extracted from it.
            return cleaned or fixed_latex
        except Exception as e:
            self.logger.error(f"❌ AI fix request failed: {e}")
            return None

    def generate_summary(
        self,
        text: str,
        base_name: str,
        materia: str = "Economía",
        bibliographic_text: Optional[str] = None,
        class_number: Optional[int] = None,
    ) -> Tuple[bool, str, Dict[str, Any]]:
        """
        Generate comprehensive academic summary in LaTeX format.

        Flow: build the prompt, obtain the AI response, extract LaTeX,
        then compile with up to three AI-assisted correction rounds. On
        success a DOCX is produced and, if configured, a Notion upload is
        started in the background. If no LaTeX is found or compilation never
        succeeds, the content is processed through the Markdown fallback.

        Args:
            text: The class transcription text
            base_name: Base filename for output files
            materia: Subject name (default: "Economía")
            bibliographic_text: Optional supporting material from books/notes
            class_number: Optional class number for header

        Returns:
            Tuple of (success, summary_text, metadata). On an unexpected
            error the tuple is (False, "", metadata).
        """
        self.logger.info(
            f"🚀 Starting LaTeX academic summary generation for: {base_name}"
        )

        metadata = {
            "filename": base_name,
            "tex_path": "",
            "pdf_path": "",
            "markdown_path": "",
            "docx_path": "",
            "summary_snippet": "",
            "notion_uploaded": False,
            "notion_page_id": None,
            "materia": materia,
        }

        try:
            # === STEP 1: Generate LaTeX content using AI ===
            self.logger.info(
                "🧠 Sending request to AI Provider for LaTeX generation..."
            )
            self._notify("📝 Preparando prompt de resumen académico...")

            prompt = prompt_manager.get_latex_summary_prompt(
                transcription=text,
                materia=materia,
                bibliographic_text=bibliographic_text,
                class_number=class_number,
            )

            self._notify(
                "🧠 Enviando solicitud a la IA (esto puede tardar unos minutos)..."
            )

            # Use parallel provider if multiple AI providers are available
            if self.use_parallel:
                raw_response = self._generate_with_parallel_provider(prompt)
            else:
                raw_response = self.ai_provider.generate_text(prompt)

            if not raw_response:
                raise FileProcessingError("AI returned empty response")

            self.logger.info(f"📝 AI response received: {len(raw_response)} characters")
            self._notify(f"✅ Respuesta recibida ({len(raw_response)} caracteres)")

            # === STEP 2: Extract clean LaTeX from AI response ===
            self._notify("🔍 Extrayendo código LaTeX...")
            latex_content = prompt_manager.extract_latex_from_response(raw_response)

            if not latex_content:
                self.logger.warning(
                    "⚠️ No valid LaTeX found in response, treating as Markdown"
                )
                self._notify("⚠️ No se detectó LaTeX válido, usando modo compatible...")
                # Fallback to Markdown processing
                return self._fallback_to_markdown(raw_response, base_name, metadata)

            self.logger.info("✨ Valid LaTeX content detected")
            self._notify(f"✨ LaTeX detectado: {len(latex_content)} caracteres")

            # === STEP 3: Compilation Loop with Self-Correction ===
            max_retries = 3
            current_latex = latex_content

            for attempt in range(max_retries + 1):
                # Sanitize LaTeX before saving (fix common AI errors like TikZ nodes)
                current_latex = _sanitize_latex(current_latex)

                # Save current .tex file
                self._notify(
                    f"📄 Guardando archivo .tex (intento {attempt + 1}/{max_retries + 1})..."
                )
                tex_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.tex"
                tex_path.write_text(current_latex, encoding="utf-8")
                metadata["tex_path"] = str(tex_path)

                # Try to compile
                self._notify("⚙️ Primera pasada de compilación LaTeX...")
                pdf_path = self._compile_latex(
                    tex_path, output_dir=settings.LOCAL_DOWNLOADS_PATH
                )

                if pdf_path:
                    self.logger.info(
                        f"✅ Compilation success on attempt {attempt + 1}!"
                    )
                    self._notify("✅ PDF generado exitosamente!")
                    metadata["pdf_path"] = str(pdf_path)

                    # Generate DOCX on the worker pool
                    self._notify("📄 Generando archivo DOCX en paralelo...")
                    docx_path = self._convert_formats_parallel(
                        tex_path, pdf_path, base_name
                    )
                    if docx_path:
                        self._notify("✅ DOCX generado exitosamente!")
                        metadata["docx_path"] = str(docx_path)

                    # Create a text summary for Notion/preview
                    text_summary = self._create_text_summary(current_latex)
                    metadata["summary_snippet"] = text_summary[:500] + "..."

                    # Upload to Notion in background if configured
                    if settings.has_notion_config:
                        self._notify("📤 Iniciando carga a Notion en segundo plano...")
                        self._upload_to_notion_background(
                            base_name=base_name,
                            summary=text_summary,
                            pdf_path=pdf_path,
                            metadata=metadata,
                        )

                    self._notify("🎉 ¡Resumen completado con éxito!")
                    return True, text_summary, metadata

                # Compilation failed on the final attempt: give up.
                if attempt >= max_retries:
                    self.logger.error(
                        "❌ Max retries reached. LaTeX compilation failed."
                    )
                    self._notify(
                        "❌ No se pudo compilar el LaTeX después de varios intentos"
                    )
                    break

                # Compilation failed - ask AI to fix
                self.logger.warning(
                    f"⚠️ Compilation failed (Attempt {attempt + 1}/{max_retries + 1}). "
                    "Requesting AI fix..."
                )
                self._notify(
                    f"⚠️ Error de compilación ({attempt + 1}/{max_retries + 1}), solicitando corrección a IA..."
                )

                error_log = self._read_log_tail(base_name)
                fixed_latex = self._request_latex_fix(current_latex, error_log)
                if fixed_latex is None:
                    # No usable fix: keep the last LaTeX and stop retrying.
                    break
                current_latex = fixed_latex

            # If we get here, all compilation attempts failed
            self._notify("⚠️ Usando modo de compatibilidad Markdown...")
            return self._fallback_to_markdown(
                current_latex or raw_response, base_name, metadata
            )

        except Exception as e:
            self.logger.error(
                f"❌ Critical error in document generation: {e}", exc_info=True
            )
            self._notify(f"❌ Error en la generación: {str(e)[:100]}")
            return False, "", metadata

    def _compile_latex(self, tex_path: Path, output_dir: Path) -> Optional[Path]:
        """
        Compile LaTeX to PDF using pdflatex. Runs twice for TOC.

        Success requires the second pass to exit with code 0 and the PDF to
        exist; auxiliary files are then removed. On failure the .log file is
        left in place so callers can read it.

        Args:
            tex_path: Path to .tex file
            output_dir: Directory for output files

        Returns:
            Path to generated PDF or None if failed
        """
        base_name = tex_path.stem
        expected_pdf = output_dir / f"{base_name}.pdf"

        # Check if pdflatex is available
        if not shutil.which("pdflatex"):
            self.logger.error("🚫 pdflatex not found in system PATH")
            return None

        cmd = [
            "pdflatex",
            "-interaction=nonstopmode",
            "-halt-on-error",
            f"-output-directory={output_dir}",
            str(tex_path),
        ]

        try:
            # Pass 1
            self.logger.info("⚙️ Compiling LaTeX (Pass 1/2)...")
            subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
                timeout=120,
            )

            # Pass 2 (for TOC resolution)
            self.logger.info("⚙️ Compiling LaTeX (Pass 2/2)...")
            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
                timeout=120,
            )
        except subprocess.TimeoutExpired:
            self.logger.error("❌ LaTeX compilation timed out")
            return None
        except Exception as e:
            self.logger.error(f"❌ Error during LaTeX execution: {e}")
            return None

        if result.returncode == 0 and expected_pdf.exists():
            self.logger.info(f"✅ PDF generated: {expected_pdf}")
            self._cleanup_latex_aux(output_dir, base_name)
            return expected_pdf

        # Report the first line of the log that starts with "!", if any.
        error_snippet = "Unknown error"
        log_file = output_dir / f"{base_name}.log"
        if log_file.exists():
            try:
                log_content = log_file.read_text(encoding="utf-8", errors="ignore")
            except OSError as e:
                # Best effort: the snippet is only used for the log message.
                self.logger.debug(f"Could not read LaTeX log {log_file}: {e}")
            else:
                first_error = next(
                    (
                        line
                        for line in log_content.splitlines()
                        if line.startswith("!")
                    ),
                    None,
                )
                if first_error:
                    error_snippet = first_error[:200]

        self.logger.error(f"❌ LaTeX compilation failed: {error_snippet}")
        return None

    def _convert_tex_to_docx(self, tex_path: Path, base_name: str) -> Optional[Path]:
        """Convert .tex to .docx using Pandoc.

        Returns:
            Path to the DOCX in settings.LOCAL_DOCX, or None when pandoc is
            missing or the conversion fails.
        """
        if not shutil.which("pandoc"):
            self.logger.warning("⚠️ pandoc not found, skipping DOCX generation")
            return None

        docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
        cmd = ["pandoc", str(tex_path), "-o", str(docx_path)]

        try:
            subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
            self.logger.info(f"✅ DOCX generated: {docx_path}")
            return docx_path
        except Exception as e:
            self.logger.warning(f"⚠️ DOCX generation failed: {e}")
            return None

    def _create_text_summary(self, latex_content: str) -> str:
        """Extract a plain text summary from LaTeX content for Notion/preview.

        This is a regex-based approximation, not a LaTeX parser: preamble
        commands are dropped, sectioning commands become Markdown headings,
        list items become bullets, and any remaining command is removed
        together with its braced arguments.
        """
        text = latex_content

        # Remove document class and packages
        text = re.sub(r"\\documentclass\[?[^\]]*\]?\{[^\}]+\}", "", text)
        text = re.sub(r"\\usepackage\{[^\}]+\}", "", text)
        text = re.sub(r"\\geometry\{[^\}]+\}", "", text)
        text = re.sub(r"\\pagestyle\{[^\}]+\}", "", text)
        text = re.sub(r"\\fancyhf\{\}", "", text)
        text = re.sub(r"\\fancyhead\[?[^\]]*\]?\{[^\}]+\}", "", text)
        text = re.sub(r"\\fancyfoot\[?[^\]]*\]?\{[^\}]+\}", "", text)

        # Convert sections to markdown-style
        text = re.sub(r"\\section\*?\{([^\}]+)\}", r"# \1", text)
        text = re.sub(r"\\subsection\*?\{([^\}]+)\}", r"## \1", text)
        text = re.sub(r"\\subsubsection\*?\{([^\}]+)\}", r"### \1", text)

        # Remove tcolorbox environments (keep content)
        text = re.sub(
            r"\\begin\{(definicion|importante|ejemplo)\}\[?[^\]]*\]?",
            r"\n**\1:** ",
            text,
        )
        text = re.sub(r"\\end\{(definicion|importante|ejemplo)\}", "", text)

        # Convert itemize to bullets
        text = re.sub(r"\\item\s*", "- ", text)
        text = re.sub(r"\\begin\{(itemize|enumerate)\}", "", text)
        text = re.sub(r"\\end\{(itemize|enumerate)\}", "", text)

        # Clean up math (basic)
        text = re.sub(r"\$\$([^\$]+)\$\$", r"\n\n\1\n\n", text)
        text = re.sub(r"\$([^\$]+)\$", r"\1", text)

        # Remove remaining LaTeX commands
        text = re.sub(r"\\[a-zA-Z]+(\{[^\}]*\})*", "", text)
        text = re.sub(r"[{}]", "", text)

        # Clean whitespace
        text = re.sub(r"\n\s*\n\s*\n", "\n\n", text)
        return text.strip()

    def _fallback_to_markdown(
        self, content: str, base_name: str, metadata: Dict[str, Any]
    ) -> Tuple[bool, str, Dict[str, Any]]:
        """Fallback when LaTeX generation fails.

        Writes ``content`` to ``<base_name>_resumen.md`` and, when pandoc is
        available, tries to convert it to PDF and DOCX. Always reports
        success, even if both conversions fail.

        Returns:
            Tuple of (True, content, metadata) with the paths filled in.
        """
        self.logger.warning("⚠️ Falling back to Markdown processing")

        md_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_resumen.md"
        md_path.write_text(content, encoding="utf-8")
        metadata["markdown_path"] = str(md_path)

        # Try to convert to PDF via pandoc
        if shutil.which("pandoc"):
            pdf_path = self._convert_md_to_pdf(md_path, base_name)
            if pdf_path:
                metadata["pdf_path"] = str(pdf_path)

            docx_path = self._convert_md_to_docx(md_path, base_name)
            if docx_path:
                metadata["docx_path"] = str(docx_path)

        metadata["summary_snippet"] = content[:500] + "..."
        return True, content, metadata

    def _convert_md_to_pdf(self, md_path: Path, base_name: str) -> Optional[Path]:
        """Convert Markdown to PDF using pandoc.

        Returns:
            Path to the PDF, or None if pandoc failed or timed out.
        """
        pdf_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.pdf"
        cmd = [
            "pandoc",
            str(md_path),
            "-o",
            str(pdf_path),
            "--pdf-engine=pdflatex",
            "-V",
            "geometry:margin=2.5cm",
        ]

        try:
            subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
            self.logger.info(f"✅ PDF from Markdown: {pdf_path}")
            return pdf_path
        except Exception as e:
            self.logger.warning(f"⚠️ PDF from Markdown failed: {e}")
            return None

    def _convert_md_to_docx(self, md_path: Path, base_name: str) -> Optional[Path]:
        """Convert Markdown to DOCX using pandoc.

        Returns:
            Path to the DOCX, or None if pandoc failed or timed out.
        """
        docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
        cmd = ["pandoc", str(md_path), "-o", str(docx_path)]

        try:
            subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
            self.logger.info(f"✅ DOCX from Markdown: {docx_path}")
            return docx_path
        except Exception as e:
            self.logger.warning(f"⚠️ DOCX from Markdown failed: {e}")
            return None

    def _cleanup_latex_aux(self, output_dir: Path, base_name: str) -> None:
        """Clean up auxiliary LaTeX files (.aux, .log, .out, .toc).

        Deletion is best effort: a file that cannot be removed is logged at
        debug level and skipped.
        """
        for ext in (".aux", ".log", ".out", ".toc"):
            aux_file = output_dir / f"{base_name}{ext}"
            if not aux_file.exists():
                continue
            try:
                aux_file.unlink()
            except OSError as e:
                self.logger.debug(f"Could not remove {aux_file}: {e}")

    def _upload_to_notion(
        self,
        base_name: str,
        summary: str,
        pdf_path: Optional[Path],
        metadata: Dict[str, Any],
    ) -> None:
        """Upload summary to Notion if configured.

        Synchronous counterpart of _upload_to_notion_background. On success
        sets "notion_uploaded" and "notion_page_id" in ``metadata``; errors
        are logged and never raised.
        """
        try:
            from services.notion_service import notion_service

            title = base_name.replace("_", " ").title()
            notion_metadata = {
                "file_type": "Audio",
                "pdf_path": pdf_path or Path(""),
                "add_status": False,
                "use_as_page": False,
            }

            page_id = notion_service.create_page_with_summary(
                title=title, summary=summary, metadata=notion_metadata
            )

            if page_id:
                metadata["notion_uploaded"] = True
                metadata["notion_page_id"] = page_id
                self.logger.info(f"✅ Uploaded to Notion: {title}")
            else:
                self.logger.warning(f"⚠️ Notion upload failed: {title}")
        except Exception as e:
            self.logger.warning(f"❌ Notion upload error: {e}")