Files
cbc2027/document/generators.py
renato97 dcf887c510 feat: Sistema LaTeX mejorado con sanitización automática y corrección de TikZ
Cambios principales:

## Nuevos archivos
- services/ai/parallel_provider.py: Ejecución paralela de múltiples proveedores AI
- services/ai/prompt_manager.py: Gestión centralizada de prompts (resumen.md como fuente)
- latex/resumen.md: Template del prompt para resúmenes académicos LaTeX

## Mejoras en generación LaTeX (document/generators.py)
- Nueva función _sanitize_latex(): Corrige automáticamente errores comunes de AI
  - Agrega align=center a nodos TikZ con saltos de línea (\\)
  - Previene errores 'Not allowed in LR mode' antes de compilar
- Soporte para procesamiento paralelo de proveedores AI
- Conversión DOCX en paralelo con generación PDF
- Uploads a Notion en background (non-blocking)
- Callbacks de notificación para progreso en Telegram

## Mejoras en proveedores AI
- claude_provider.py: fix_latex() con instrucciones específicas para errores TikZ
- gemini_provider.py: fix_latex() mejorado + rate limiting + circuit breaker
- provider_factory.py: Soporte para parallel provider

## Otros cambios
- config/settings.py: Nuevas configuraciones para Gemini models
- services/webdav_service.py: Mejoras en manejo de conexión
- .gitignore: Ignora archivos LaTeX auxiliares (.aux, .toc, .out, .pdf)

## Archivos de ejemplo
- latex/imperio_romano.tex, latex/clase_revolucion_rusa_crisis_30.tex
- resumen_curiosidades.tex (corregido y compilado exitosamente)
2026-02-07 20:50:27 +00:00

670 lines
25 KiB
Python

"""
Document generation utilities - LaTeX Academic Summary System
This module generates comprehensive academic summaries in LaTeX format
following the specifications in latex/resumen.md (the SINGLE SOURCE OF TRUTH).
Parallel Processing: Uses multiple agents for accelerated summary generation:
- AI Provider Racing: Multiple AI providers generate in parallel
- Parallel Format Conversion: PDF + DOCX generated simultaneously
- Background Notion Uploads: Non-blocking uploads to Notion
"""
import logging
import subprocess
import shutil
import re
import threading
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
from core import FileProcessingError
from config import settings
from services.ai import ai_provider_factory
from services.ai.prompt_manager import prompt_manager
def _sanitize_latex(latex_code: str) -> str:
"""
Pre-process LaTeX code to fix common errors before compilation.
This function applies automated fixes for known issues that AI models
frequently generate, reducing the need for fix_latex() iterations.
Currently handles:
- TikZ nodes with line breaks (\\\\) missing align=center
- Unbalanced environments (best effort)
"""
if not latex_code:
return latex_code
result = latex_code
# Fix TikZ nodes with \\\\ but missing align=center
# Pattern: \node[...] (name) {Text\\More};
# This is a common AI error - TikZ requires align=center for \\\\ in nodes
# We need to find \node commands and add align=center if they have \\\\ in content
# but don't already have align= in their options
def fix_tikz_node(match):
"""Fix a single TikZ node by adding align=center if needed"""
full_match = match.group(0)
options = match.group(1) # Content inside [...]
rest = match.group(2) # Everything after options
# Check if this node has \\\\ in its content (text between { })
# and doesn't already have align=
if "\\\\" in rest and "align=" not in options:
# Add align=center to the options
if options.strip():
new_options = options.rstrip() + ", align=center"
else:
new_options = "align=center"
return f"\\node[{new_options}]{rest}"
return full_match
# Match \node[options] followed by rest of the line
# Capture options and the rest separately
tikz_node_pattern = r"\\node\[([^\]]*)\]([^;]*;)"
result = re.sub(tikz_node_pattern, fix_tikz_node, result)
return result
class DocumentGenerator:
"""
Generates academic summary documents in LaTeX format.
The system follows these principles:
1. latex/resumen.md is the SINGLE SOURCE OF TRUTH for prompt structure
2. Generates full LaTeX documents (not Markdown)
3. Compiles to PDF using pdflatex
4. Supports iterative fixing with AI if compilation fails
5. Supports progress notifications via callback
"""
def __init__(self, notification_callback: Optional[Callable[[str], None]] = None):
    """
    Initialize DocumentGenerator.

    Args:
        notification_callback: Optional callable invoked with progress
            messages (a single string argument), e.g. a Telegram sender.
    """
    self.logger = logging.getLogger(__name__)
    self.notification_callback = notification_callback
    self.ai_provider = ai_provider_factory.get_best_provider()
    self.use_parallel = ai_provider_factory.use_parallel()
    # Shared pool used for format conversions (e.g. DOCX) alongside PDF work.
    self.executor = ThreadPoolExecutor(max_workers=4)

    # Make sure both output locations exist before any generation runs.
    for directory in (settings.LOCAL_DOWNLOADS_PATH, settings.LOCAL_DOCX):
        directory.mkdir(parents=True, exist_ok=True)

    if self.use_parallel:
        self.logger.info(
            "🚀 Parallel processing enabled: Multiple AI providers available"
        )
def _notify(self, message: str) -> None:
"""Send notification if callback is configured"""
if self.notification_callback:
try:
self.notification_callback(message)
except Exception as e:
self.logger.warning(f"Failed to send notification: {e}")
def _generate_with_parallel_provider(self, prompt: str, **kwargs) -> str:
    """
    Race multiple AI providers and return the first successful response.

    Any failure in the parallel path (provider setup or generation) falls
    back to a single call on the default provider, so this never raises
    earlier than ``generate_text`` itself would.
    """
    try:
        provider = ai_provider_factory.get_parallel_provider(max_workers=4)
        self.logger.info("🚀 Using parallel AI provider (race mode)")
        outcome = provider.generate_parallel(
            prompt=prompt,
            strategy="race",  # Use first successful response
            timeout_ms=300000,  # 5 minutes
            **kwargs,
        )
        self.logger.info(
            f"✅ Parallel generation complete: {outcome.selected_provider} selected, "
            f"{outcome.total_duration_ms}ms"
        )
        return outcome.content
    except Exception as exc:
        self.logger.warning(
            f"⚠️ Parallel generation failed: {exc}, falling back to single provider"
        )
        return self.ai_provider.generate_text(prompt, **kwargs)
def _convert_formats_parallel(
    self, tex_path: Path, pdf_path: Optional[Path], base_name: str
) -> Optional[Path]:
    """
    Schedule DOCX conversion on the shared executor and wait for it.

    The PDF is expected to already exist by the time this runs, so only the
    DOCX conversion is submitted — and only when pandoc is installed.
    Returns the DOCX path, or None on failure / missing pandoc.
    """
    docx_future = None
    if shutil.which("pandoc"):
        docx_future = self.executor.submit(
            self._convert_tex_to_docx, tex_path, base_name
        )

    if docx_future is None:
        return None

    try:
        docx_path = docx_future.result(timeout=60)
    except Exception as e:
        self.logger.warning(f"⚠️ DOCX generation failed: {e}")
        return None

    if docx_path:
        self.logger.info(f"✅ Parallel DOCX generated: {docx_path}")
    return docx_path
def _upload_to_notion_background(
    self,
    base_name: str,
    summary: str,
    pdf_path: Optional[Path],
    metadata: Dict[str, Any],
):
    """
    Fire-and-forget Notion upload on a daemon thread (non-blocking).

    NOTE(review): the worker mutates *metadata* after this method returns,
    so callers reading notion_uploaded/notion_page_id may race the upload.
    """

    def upload_worker():
        try:
            # Imported lazily so the module loads without Notion configured.
            from services.notion_service import notion_service

            title = base_name.replace("_", " ").title()
            notion_metadata = {
                "file_type": "Audio",
                "pdf_path": pdf_path or Path(""),
                "add_status": False,
                "use_as_page": False,
            }
            page_id = notion_service.create_page_with_summary(
                title=title, summary=summary, metadata=notion_metadata
            )
            if not page_id:
                self.logger.warning(f"⚠️ Background Notion upload failed: {title}")
                return
            metadata["notion_uploaded"] = True
            metadata["notion_page_id"] = page_id
            self.logger.info(
                f"✅ Background upload to Notion complete: {title}"
            )
        except Exception as e:
            self.logger.warning(f"❌ Background Notion upload error: {e}")

    threading.Thread(target=upload_worker, daemon=True).start()
    self.logger.info("🔄 Notion upload started in background")
def generate_summary(
    self,
    text: str,
    base_name: str,
    materia: str = "Economía",
    bibliographic_text: Optional[str] = None,
    class_number: Optional[int] = None,
) -> Tuple[bool, str, Dict[str, Any]]:
    """
    Generate a comprehensive academic summary in LaTeX format.

    Pipeline: build the prompt from the resumen.md template -> generate
    LaTeX with the AI provider (racing providers when parallel mode is on)
    -> sanitize -> compile with pdflatex, asking the AI to fix compile
    errors up to ``max_retries`` times -> convert to DOCX and start a
    background Notion upload. Falls back to Markdown output when no valid
    LaTeX is detected or all compilation attempts fail.

    Args:
        text: The class transcription text
        base_name: Base filename for output files
        materia: Subject name (default: "Economía")
        bibliographic_text: Optional supporting material from books/notes
        class_number: Optional class number for header

    Returns:
        Tuple of (success, summary_text, metadata). ``metadata`` carries the
        generated file paths, a 500-char snippet, and the Notion upload
        state — note the background upload thread may update the Notion
        fields after this method has already returned.
    """
    self.logger.info(
        f"🚀 Starting LaTeX academic summary generation for: {base_name}"
    )
    metadata = {
        "filename": base_name,
        "tex_path": "",
        "pdf_path": "",
        "markdown_path": "",
        "docx_path": "",
        "summary_snippet": "",
        "notion_uploaded": False,
        "notion_page_id": None,
        "materia": materia,
    }
    try:
        # === STEP 1: Generate LaTeX content using AI ===
        self.logger.info(
            "🧠 Sending request to AI Provider for LaTeX generation..."
        )
        self._notify("📝 Preparando prompt de resumen académico...")
        prompt = prompt_manager.get_latex_summary_prompt(
            transcription=text,
            materia=materia,
            bibliographic_text=bibliographic_text,
            class_number=class_number,
        )
        self._notify(
            "🧠 Enviando solicitud a la IA (esto puede tardar unos minutos)..."
        )
        # Use parallel provider if multiple AI providers are available
        if self.use_parallel:
            raw_response = self._generate_with_parallel_provider(prompt)
        else:
            raw_response = self.ai_provider.generate_text(prompt)
        if not raw_response:
            raise FileProcessingError("AI returned empty response")
        self.logger.info(f"📝 AI response received: {len(raw_response)} characters")
        self._notify(f"✅ Respuesta recibida ({len(raw_response)} caracteres)")
        # === STEP 2: Extract clean LaTeX from AI response ===
        self._notify("🔍 Extrayendo código LaTeX...")
        latex_content = prompt_manager.extract_latex_from_response(raw_response)
        if not latex_content:
            self.logger.warning(
                "⚠️ No valid LaTeX found in response, treating as Markdown"
            )
            self._notify("⚠️ No se detectó LaTeX válido, usando modo compatible...")
            # Fallback to Markdown processing
            return self._fallback_to_markdown(raw_response, base_name, metadata)
        self.logger.info("✨ Valid LaTeX content detected")
        self._notify(f"✨ LaTeX detectado: {len(latex_content)} caracteres")
        # === STEP 3: Compilation Loop with Self-Correction ===
        # Up to max_retries + 1 compile attempts; each failure feeds the
        # pdflatex log back to the AI via fix_latex() for correction.
        max_retries = 3
        current_latex = latex_content
        for attempt in range(max_retries + 1):
            # Sanitize LaTeX before saving (fix common AI errors like TikZ nodes)
            current_latex = _sanitize_latex(current_latex)
            # Save current .tex file
            self._notify(
                f"📄 Guardando archivo .tex (intento {attempt + 1}/{max_retries + 1})..."
            )
            tex_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.tex"
            tex_path.write_text(current_latex, encoding="utf-8")
            metadata["tex_path"] = str(tex_path)
            # Try to compile
            self._notify("⚙️ Primera pasada de compilación LaTeX...")
            pdf_path = self._compile_latex(
                tex_path, output_dir=settings.LOCAL_DOWNLOADS_PATH
            )
            if pdf_path:
                self.logger.info(
                    f"✅ Compilation success on attempt {attempt + 1}!"
                )
                self._notify("✅ PDF generado exitosamente!")
                metadata["pdf_path"] = str(pdf_path)
                # Generate DOCX in parallel
                self._notify("📄 Generando archivo DOCX en paralelo...")
                docx_path = self._convert_formats_parallel(
                    tex_path, pdf_path, base_name
                )
                if docx_path:
                    self._notify("✅ DOCX generado exitosamente!")
                    metadata["docx_path"] = str(docx_path)
                # Create a text summary for Notion/preview
                text_summary = self._create_text_summary(current_latex)
                metadata["summary_snippet"] = text_summary[:500] + "..."
                # Upload to Notion in background if configured
                if settings.has_notion_config:
                    self._notify("📤 Iniciando carga a Notion en segundo plano...")
                    self._upload_to_notion_background(
                        base_name=base_name,
                        summary=text_summary,
                        pdf_path=pdf_path,
                        metadata=metadata,
                    )
                self._notify("🎉 ¡Resumen completado con éxito!")
                return True, text_summary, metadata
            # Compilation failed - ask AI to fix
            if attempt < max_retries:
                self.logger.warning(
                    f"⚠️ Compilation failed (Attempt {attempt + 1}/{max_retries + 1}). "
                    f"Requesting AI fix..."
                )
                self._notify(
                    f"⚠️ Error de compilación ({attempt + 1}/{max_retries + 1}), solicitando corrección a IA..."
                )
                # Get error log (last 2000 chars only, to keep the fix prompt small)
                log_file = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.log"
                error_log = "Log file not found"
                if log_file.exists():
                    error_log = log_file.read_text(
                        encoding="utf-8", errors="ignore"
                    )[-2000:]
                # Ask AI to fix
                try:
                    self._notify("🔧 La IA está corrigiendo el código LaTeX...")
                    if hasattr(self.ai_provider, "fix_latex"):
                        fixed_latex = self.ai_provider.fix_latex(
                            current_latex, error_log
                        )
                        # Re-extract in case the fix came wrapped in prose/fences.
                        cleaned = prompt_manager.extract_latex_from_response(
                            fixed_latex
                        )
                        if cleaned:
                            current_latex = cleaned
                        else:
                            current_latex = fixed_latex
                        self._notify(
                            "✅ Código LaTeX corregido, reintentando compilación..."
                        )
                    else:
                        self.logger.error(
                            "❌ AI provider doesn't support fix_latex()"
                        )
                        break
                except Exception as e:
                    self.logger.error(f"❌ AI fix request failed: {e}")
                    break
            else:
                self.logger.error(
                    "❌ Max retries reached. LaTeX compilation failed."
                )
                self._notify(
                    "❌ No se pudo compilar el LaTeX después de varios intentos"
                )
        # If we get here, all compilation attempts failed
        self._notify("⚠️ Usando modo de compatibilidad Markdown...")
        return self._fallback_to_markdown(
            current_latex or raw_response, base_name, metadata
        )
    except Exception as e:
        self.logger.error(
            f"❌ Critical error in document generation: {e}", exc_info=True
        )
        self._notify(f"❌ Error en la generación: {str(e)[:100]}")
        return False, "", metadata
def _compile_latex(self, tex_path: Path, output_dir: Path) -> Optional[Path]:
"""
Compile LaTeX to PDF using pdflatex. Runs twice for TOC.
Args:
tex_path: Path to .tex file
output_dir: Directory for output files
Returns:
Path to generated PDF or None if failed
"""
base_name = tex_path.stem
expected_pdf = output_dir / f"{base_name}.pdf"
# Check if pdflatex is available
if not shutil.which("pdflatex"):
self.logger.error("🚫 pdflatex not found in system PATH")
return None
cmd = [
"pdflatex",
"-interaction=nonstopmode",
"-halt-on-error",
f"-output-directory={output_dir}",
str(tex_path),
]
try:
# Pass 1
self.logger.info("⚙️ Compiling LaTeX (Pass 1/2)...")
subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
timeout=120,
)
# Pass 2 (for TOC resolution)
self.logger.info("⚙️ Compiling LaTeX (Pass 2/2)...")
result = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
timeout=120,
)
if result.returncode == 0 and expected_pdf.exists():
self.logger.info(f"✅ PDF generated: {expected_pdf}")
self._cleanup_latex_aux(output_dir, base_name)
return expected_pdf
else:
# Read log file for error info
log_file = output_dir / f"{base_name}.log"
error_snippet = "Unknown error"
if log_file.exists():
try:
log_content = log_file.read_text(
encoding="utf-8", errors="ignore"
)
errors = [
line
for line in log_content.splitlines()
if line.startswith("!")
]
if errors:
error_snippet = errors[0][:200]
except:
pass
self.logger.error(f"❌ LaTeX compilation failed: {error_snippet}")
return None
except subprocess.TimeoutExpired:
self.logger.error("❌ LaTeX compilation timed out")
return None
except Exception as e:
self.logger.error(f"❌ Error during LaTeX execution: {e}")
return None
def _convert_tex_to_docx(self, tex_path: Path, base_name: str) -> Optional[Path]:
    """Convert .tex to .docx using Pandoc."""
    if shutil.which("pandoc") is None:
        self.logger.warning("⚠️ pandoc not found, skipping DOCX generation")
        return None

    target = settings.LOCAL_DOCX / f"{base_name}.docx"
    pandoc_cmd = ["pandoc", str(tex_path), "-o", str(target)]
    try:
        subprocess.run(
            pandoc_cmd, capture_output=True, text=True, check=True, timeout=60
        )
    except Exception as e:
        # Non-fatal: the PDF is the primary artifact, DOCX is best-effort.
        self.logger.warning(f"⚠️ DOCX generation failed: {e}")
        return None
    self.logger.info(f"✅ DOCX generated: {target}")
    return target
def _create_text_summary(self, latex_content: str) -> str:
"""Extract a plain text summary from LaTeX content for Notion/preview."""
# Remove LaTeX commands and keep content
text = latex_content
# Remove document class and packages
text = re.sub(r"\\documentclass\[?[^\]]*\]?\{[^\}]+\}", "", text)
text = re.sub(r"\\usepackage\{[^\}]+\}", "", text)
text = re.sub(r"\\geometry\{[^\}]+\}", "", text)
text = re.sub(r"\\pagestyle\{[^\}]+\}", "", text)
text = re.sub(r"\\fancyhf\{\}", "", text)
text = re.sub(r"\\fancyhead\[?[^\]]*\]?\{[^\}]+\}", "", text)
text = re.sub(r"\\fancyfoot\[?[^\]]*\]?\{[^\}]+\}", "", text)
# Convert sections to markdown-style
text = re.sub(r"\\section\*?\{([^\}]+)\}", r"# \1", text)
text = re.sub(r"\\subsection\*?\{([^\}]+)\}", r"## \1", text)
text = re.sub(r"\\subsubsection\*?\{([^\}]+)\}", r"### \1", text)
# Remove tcolorbox environments (keep content)
text = re.sub(
r"\\begin\{(definicion|importante|ejemplo)\}\[?[^\]]*\]?",
r"\n**\1:** ",
text,
)
text = re.sub(r"\\end\{(definicion|importante|ejemplo)\}", "", text)
# Convert itemize to bullets
text = re.sub(r"\\item\s*", "- ", text)
text = re.sub(r"\\begin\{(itemize|enumerate)\}", "", text)
text = re.sub(r"\\end\{(itemize|enumerate)\}", "", text)
# Clean up math (basic)
text = re.sub(r"\$\$([^\$]+)\$\$", r"\n\n\1\n\n", text)
text = re.sub(r"\$([^\$]+)\$", r"\1", text)
# Remove remaining LaTeX commands
text = re.sub(r"\\[a-zA-Z]+(\{[^\}]*\})*", "", text)
text = re.sub(r"[{}]", "", text)
# Clean whitespace
text = re.sub(r"\n\s*\n\s*\n", "\n\n", text)
text = text.strip()
return text
def _fallback_to_markdown(
    self, content: str, base_name: str, metadata: Dict[str, Any]
) -> Tuple[bool, str, Dict[str, Any]]:
    """Persist raw content as Markdown and best-effort convert it to PDF/DOCX."""
    self.logger.warning("⚠️ Falling back to Markdown processing")
    md_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_resumen.md"
    md_path.write_text(content, encoding="utf-8")
    metadata["markdown_path"] = str(md_path)

    # pandoc is optional; without it we still return the raw Markdown.
    if shutil.which("pandoc"):
        pdf_result = self._convert_md_to_pdf(md_path, base_name)
        if pdf_result:
            metadata["pdf_path"] = str(pdf_result)
        docx_result = self._convert_md_to_docx(md_path, base_name)
        if docx_result:
            metadata["docx_path"] = str(docx_result)

    metadata["summary_snippet"] = content[:500] + "..."
    return True, content, metadata
def _convert_md_to_pdf(self, md_path: Path, base_name: str) -> Optional[Path]:
    """Convert Markdown to PDF using pandoc (pdflatex engine, 2.5cm margins)."""
    pdf_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.pdf"
    pandoc_cmd = [
        "pandoc",
        str(md_path),
        "-o",
        str(pdf_path),
        "--pdf-engine=pdflatex",
        "-V",
        "geometry:margin=2.5cm",
    ]
    try:
        subprocess.run(
            pandoc_cmd, capture_output=True, text=True, check=True, timeout=60
        )
    except Exception as e:
        self.logger.warning(f"⚠️ PDF from Markdown failed: {e}")
        return None
    self.logger.info(f"✅ PDF from Markdown: {pdf_path}")
    return pdf_path
def _convert_md_to_docx(self, md_path: Path, base_name: str) -> Optional[Path]:
    """Convert Markdown to DOCX using pandoc."""
    docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
    pandoc_cmd = ["pandoc", str(md_path), "-o", str(docx_path)]
    try:
        subprocess.run(
            pandoc_cmd, capture_output=True, text=True, check=True, timeout=60
        )
    except Exception as e:
        self.logger.warning(f"⚠️ DOCX from Markdown failed: {e}")
        return None
    self.logger.info(f"✅ DOCX from Markdown: {docx_path}")
    return docx_path
def _cleanup_latex_aux(self, output_dir: Path, base_name: str):
"""Clean up auxiliary LaTeX files."""
extensions = [".aux", ".log", ".out", ".toc"]
for ext in extensions:
aux_file = output_dir / f"{base_name}{ext}"
if aux_file.exists():
try:
aux_file.unlink()
except:
pass
def _upload_to_notion(
    self,
    base_name: str,
    summary: str,
    pdf_path: Optional[Path],
    metadata: Dict[str, Any],
):
    """Synchronously upload the summary to Notion, recording the page id in *metadata*."""
    try:
        # Imported lazily so the module loads without Notion configured.
        from services.notion_service import notion_service

        title = base_name.replace("_", " ").title()
        notion_metadata = {
            "file_type": "Audio",
            "pdf_path": pdf_path or Path(""),
            "add_status": False,
            "use_as_page": False,
        }
        page_id = notion_service.create_page_with_summary(
            title=title, summary=summary, metadata=notion_metadata
        )
        if not page_id:
            self.logger.warning(f"⚠️ Notion upload failed: {title}")
            return
        metadata["notion_uploaded"] = True
        metadata["notion_page_id"] = page_id
        self.logger.info(f"✅ Uploaded to Notion: {title}")
    except Exception as e:
        # Notion failures must not abort the document pipeline.
        self.logger.warning(f"❌ Notion upload error: {e}")