cbc2027/document/generators.py

"""
Document generation utilities
"""

import logging
import subprocess
import re
from pathlib import Path
from typing import Dict, Any, List, Tuple
from core import FileProcessingError
from config import settings
from services.ai import ai_provider_factory


class DocumentGenerator:
    """Generate documents from processed text"""

    def __init__(self):
        """Attach a module-named logger and pick the AI provider to use.

        The provider is whatever ``ai_provider_factory.get_best_provider()``
        returns at construction time; it is reused for every generation call.
        """
        self.logger = logging.getLogger(__name__)
        self.ai_provider = ai_provider_factory.get_best_provider()

    def generate_summary(
        self, text: str, base_name: str
    ) -> Tuple[bool, str, Dict[str, Any]]:
        """Generate a unified summary of ``text`` and export it to disk.

        Pipeline: extract bullet points, draft an academic summary, ask the
        provider to tidy its Markdown, then write Markdown/DOCX/PDF files and,
        when Notion is configured, upload the summary.

        Args:
            text: Source text to summarise. Only the first 15000 characters
                feed the bullet-point prompt and the first 20000 the summary
                prompt.
            base_name: Stem used for the output file names and Notion title.

        Returns:
            ``(True, summary, metadata)`` on success, where ``metadata`` holds
            the keys ``markdown_path``, ``docx_path``, ``pdf_path``,
            ``docx_name``, ``summary``, ``filename``, ``notion_uploaded`` and
            ``notion_page_id``. ``(False, "", {})`` when a critical step
            fails. DOCX, PDF and Notion failures are non-critical: they are
            logged and leave the corresponding entries empty.
        """
        self.logger.info(f"Generating summary for {base_name}")

        try:
            # Steps 1-3: content generation with the main provider.
            bullet_points = self._extract_bullet_points(text)
            raw_summary = self._draft_summary(text, bullet_points)
            summary = self._polish_summary(raw_summary)

            # Generate filename
            filename = self._generate_filename(text, summary)

            # Create document
            markdown_path = self._create_markdown(summary, base_name)

            docx_path = None
            try:
                docx_path = self._create_docx(markdown_path, base_name)
            except Exception as e:
                self.logger.error(f"Failed to create DOCX (non-critical): {e}")

            pdf_path = None
            try:
                # Sanitize LaTeX before PDF generation (rewrites the Markdown
                # file in place, after the DOCX has already been produced).
                self._sanitize_latex(markdown_path)
                pdf_path = self._create_pdf(markdown_path, base_name)
            except Exception as e:
                self.logger.error(f"Failed to create PDF (non-critical): {e}")

            notion_uploaded, notion_page_id = self._upload_to_notion(
                base_name, summary, pdf_path
            )

            metadata = {
                "markdown_path": str(markdown_path),
                "docx_path": str(docx_path) if docx_path else "",
                "pdf_path": str(pdf_path) if pdf_path else "",
                "docx_name": Path(docx_path).name if docx_path else "",
                "summary": summary,
                "filename": filename,
                "notion_uploaded": notion_uploaded,
                "notion_page_id": notion_page_id,
            }

            return True, summary, metadata

        except Exception as e:
            # logger.exception keeps the traceback, which the plain error
            # message alone does not.
            self.logger.exception(f"Document generation process failed: {e}")
            return False, "", {}

    def _extract_bullet_points(self, text: str) -> str:
        """Ask the provider for key bullet points; fall back to a placeholder on error."""
        self.logger.info("Generating bullet points...")
        bullet_prompt = f"""Analiza el siguiente texto y extrae entre 5 y 8 bullet points clave en español.

REGLAS ESTRICTAS:
1. Devuelve ÚNICAMENTE bullet points, cada línea iniciando con "- "
2. Cada bullet debe ser conciso (12-20 palabras) y resaltar datos, fechas, conceptos o conclusiones importantes
3. NO agregues introducciones, conclusiones ni texto explicativo
4. Concéntrate en los puntos más importantes del texto
5. Incluye fechas, datos específicos y nombres relevantes si los hay

Texto:
{text[:15000]}"""  # Truncated to stay clear of provider context limits

        try:
            bullet_points = self.ai_provider.generate_text(bullet_prompt)
            self.logger.info(f"Bullet points generated: {len(bullet_points)}")
        except Exception as e:
            # Bullet points only enrich the next prompt, so a failure here
            # must not abort the whole run.
            self.logger.warning(f"Bullet point generation failed: {e}")
            bullet_points = "- Puntos clave no disponibles por error en IA"
        return bullet_points

    def _draft_summary(self, text: str, bullet_points: str) -> str:
        """Generate the raw Markdown summary; raise if the provider fails or returns nothing."""
        self.logger.info("Generating unified summary...")
        summary_prompt = f"""Eres un profesor universitario experto en historia y economía. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos.

REQUISITOS ESTRICTOS DE CONTENIDO:
- Extensión entre 500-700 palabras
- Usa encabezados Markdown con jerarquía clara (##, ###)
- Desarrolla los puntos clave con profundidad y contexto histórico/económico
- Mantén un tono académico y analítico
- Incluye conclusiones significativas
- NO agregues texto fuera del resumen
- Devuelve únicamente el resumen en formato Markdown

REQUISITOS ESTRICTOS DE FORMATO MATEMÁTICO (LaTeX):
- Si el texto incluye fórmulas matemáticas o económicas, DEBES usar formato LaTeX.
- Usa bloques $$ ... $$ para ecuaciones centradas importantes.
- Usa $ ... $ para ecuaciones en línea.
- Ejemplo: La fórmula del interés compuesto es $A = P(1 + r/n)^{{nt}}$.
- NO uses bloques de código (```latex) para las fórmulas, úsalas directamente en el texto para que Pandoc las renderice.

Contenido a resumir:
{text[:20000]}

Puntos clave a incluir obligatoriamente:
{bullet_points}"""

        try:
            raw_summary = self.ai_provider.generate_text(summary_prompt)
        except Exception as e:
            self.logger.error(f"Raw summary generation failed: {e}")
            raise  # bare raise keeps the original traceback intact

        # An empty draft would otherwise be exported as empty documents and
        # reported as a success.
        if not raw_summary or not str(raw_summary).strip():
            raise ValueError("AI provider returned an empty summary")
        return raw_summary

    def _polish_summary(self, raw_summary: str) -> str:
        """Have the provider clean up the Markdown; return ``raw_summary`` if that is not possible."""
        self.logger.info("Formatting summary with IA...")
        format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible y compatible con Pandoc:

{raw_summary}

Instrucciones:
- Corrige cualquier error de formato Markdown
- Asegúrate de que los encabezados estén bien espaciados
- Verifica que las viñetas usen "- " correctamente
- Mantén exactamente el contenido existente
- EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave
- VERIFICA que todas las fórmulas matemáticas estén correctamente encerradas en $...$ (inline) o $$...$$ (display)
- NO alteres la sintaxis LaTeX dentro de los delimitadores $...$ o $$...$$
- Devuelve únicamente el resumen formateado sin texto adicional"""

        try:
            # Use the main provider for formatting too
            if not self.ai_provider.is_available():
                self.logger.warning(
                    "AI provider not available for formatting, using raw summary"
                )
                return raw_summary
            formatted = self.ai_provider.generate_text(format_prompt)
        except Exception as e:
            self.logger.warning(f"Formatting failed ({e}), using raw summary")
            return raw_summary

        # A blank formatting result must not replace a valid draft.
        if not formatted or not str(formatted).strip():
            self.logger.warning(
                "Formatting returned empty output, using raw summary"
            )
            return raw_summary
        return formatted

    def _upload_to_notion(
        self, base_name: str, summary: str, pdf_path: Any
    ) -> Tuple[bool, Any]:
        """Upload the summary to Notion when configured.

        ``pdf_path`` is the generated PDF path, or ``None`` when PDF creation
        failed. Returns ``(uploaded, page_id)``; any failure is logged and
        reported as ``uploaded == False`` instead of being raised.
        """
        if not settings.has_notion_config:
            self.logger.info("Notion not configured - skipping upload")
            return False, None

        page_id = None
        try:
            # Imported lazily and inside the guard so that a broken or missing
            # Notion integration cannot fail a run whose documents are
            # already on disk.
            from services.notion_service import notion_service

            title = base_name.replace("_", " ").title()

            # Create the page with the full summary content
            notion_metadata = {
                "file_type": "Audio",  # Or 'PDF', depending on the source
                "pdf_path": pdf_path if pdf_path else Path(""),
                "add_status": False,  # Status/Tipo are not used (absent from the DB)
                "use_as_page": False,  # Use as database, not as page
            }

            page_id = notion_service.create_page_with_summary(
                title=title, summary=summary, metadata=notion_metadata
            )

            if page_id:
                self.logger.info(
                    f"✅ Resumen subido a Notion: {title} (ID: {page_id})"
                )
            else:
                self.logger.warning(f"⚠️ No se pudo subir a Notion: {title}")
        except Exception as e:
            # exc_info routes the traceback through logging instead of
            # printing it straight to stderr.
            self.logger.warning(f"❌ Error al subir a Notion: {e}", exc_info=True)

        return bool(page_id), page_id

    def _sanitize_latex(self, markdown_path: Path) -> None:
        """Rewrite the Markdown file in place so Pandoc/LaTeX can digest it.

        Three passes are applied to the file content:

        1. Escaped dollar signs are unescaped (a frequent LLM slip around math).
        2. Stray Cyrillic letters, long dashes and curly quotes are replaced
           with ASCII equivalents.
        3. Raw Unicode Greek letters become LaTeX commands. Inside an existing
           math span the command is inserted brace-wrapped; outside, each run
           of Greek letters is wrapped in a single pair of dollar signs.

        This is best-effort: any error is logged as a warning and the file is
        left as it was.

        Args:
            markdown_path: Markdown file to sanitize (read and written as UTF-8).
        """
        # Plain substitutions that are safe anywhere in the document.
        plain_chars = str.maketrans(
            {
                "ч": "ch",
                "в": "v",
                "к": "k",
                "м": "m",
                "н": "n",
                "т": "t",
                "—": "-",
                "–": "-",
                "“": '"',
                "”": '"',
                "’": "'",
            }
        )
        # Greek letters need math mode, so their replacement depends on context.
        greek = {
            "Δ": r"\Delta",
            "δ": r"\delta",
            "Σ": r"\Sigma",
            "σ": r"\sigma",
            "π": r"\pi",
            "Π": r"\Pi",
            "α": r"\alpha",
            "β": r"\beta",
            "γ": r"\gamma",
            "θ": r"\theta",
            "λ": r"\lambda",
            "μ": r"\mu",
        }
        greek_run = re.compile("[" + "".join(greek) + "]+")
        # $$...$$ display math, or $...$ inline math following Pandoc's rule:
        # non-space right after the opening $, non-space right before the
        # closing $, and no digit right after it (so "$5 and $10" is not math).
        math_span = re.compile(
            r"(\$\$.+?\$\$|\$(?=[^\s$])[^$\n]*?(?<=[^\s$])\$(?!\d))",
            re.DOTALL,
        )

        def inside_math(match):
            # Already in math mode: adding dollar signs would close the span.
            # Braces keep the command separate from a following letter without
            # introducing a space before the closing delimiter.
            return "".join("{" + greek[ch] + "}" for ch in match.group())

        def outside_math(match):
            # One math span per run avoids emitting "$\alpha$$\beta$".
            return "$" + "".join(greek[ch] for ch in match.group()) + "$"

        try:
            content = markdown_path.read_text(encoding="utf-8")

            # 1. Unescape escaped dollar signs which are common LLM errors for math
            content = content.replace(r"\$", "$")

            # 2. Normalise stray characters in a single pass
            content = content.translate(plain_chars)

            # 3. Convert Greek letters; the capturing group makes split()
            #    return math spans at the odd indices.
            pieces = math_span.split(content)
            for index, piece in enumerate(pieces):
                handler = inside_math if index % 2 else outside_math
                pieces[index] = greek_run.sub(handler, piece)
            content = "".join(pieces)

            markdown_path.write_text(content, encoding="utf-8")
            self.logger.info(f"Sanitized LaTeX in {markdown_path}")
        except Exception as e:
            self.logger.warning(f"Failed to sanitize LaTeX: {e}")

    def _generate_filename(self, text: str, summary: str) -> str:
        """Derive a short, topic-based filename stem from the summary.

        The provider is asked for 2-3 topics. If that call fails, the first
        100 characters of ``summary`` are mined instead. Up to three words
        are kept, each cut to ``settings.MAX_FILENAME_TOPICS_LENGTH``, joined
        with underscores and cut to ``settings.MAX_FILENAME_LENGTH``.

        Args:
            text: Original source text. Currently unused; kept so the
                signature stays stable for callers.
            summary: Final summary the topics are extracted from.

        Returns:
            The filename stem, or ``"documento"`` when no usable word is found
            or an unexpected error occurs.
        """
        try:
            # Use AI to extract key topics
            prompt = f"""Extract 2-3 key topics from this summary to create a filename.
Summary: {summary}

Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""

            try:
                topics_text = self.ai_provider.generate_text(prompt)
            except Exception as e:
                # Previously swallowed silently; keep the fallback but leave
                # a trace of why the AI topics were not used.
                self.logger.warning(
                    f"Topic extraction failed ({e}), using summary text"
                )
                topics_text = summary[:100]

            # Preferred: capitalised words, which reads well for headings.
            topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3]
            if not topics:
                # Providers often answer in lowercase ("revolución-industrial"),
                # which the pattern above rejects entirely. Accept any word of
                # four or more letters before giving up on the AI answer.
                topics = re.findall(r"[^\W\d_]{4,}", topics_text)[:3]
            if not topics:
                topics = ["documento"]

            # Limit topic length
            topics = [t[: settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]

            return "_".join(topics)[: settings.MAX_FILENAME_LENGTH]

        except Exception as e:
            self.logger.error(f"Filename generation failed: {e}")
            return "documento"

    def _create_markdown(self, summary: str, base_name: str) -> Path:
        """Write the summary to ``<base_name>_unificado.md`` and return its path.

        The file goes into ``settings.LOCAL_DOWNLOADS_PATH`` (created on
        demand) and contains a title derived from ``base_name``, a "Resumen"
        section holding ``summary``, a rule and a footer line.
        """
        target_dir = settings.LOCAL_DOWNLOADS_PATH
        target_dir.mkdir(parents=True, exist_ok=True)

        # Underscores become spaces and each word is capitalised for the H1.
        heading = base_name.replace("_", " ").title()
        document = (
            f"# {heading}\n"
            "\n"
            "## Resumen\n"
            "\n"
            f"{summary}\n"
            "\n"
            "---\n"
            "\n"
            "*Generado por CBCFacil*\n"
        )

        destination = target_dir / f"{base_name}_unificado.md"
        destination.write_text(document, encoding="utf-8")
        return destination

    def _create_docx(self, markdown_path: Path, base_name: str) -> Path:
        """Convert the Markdown file to DOCX with pandoc.

        Args:
            markdown_path: Markdown file to convert.
            base_name: Stem of the output file, written as
                ``<base_name>_unificado.docx`` in ``settings.LOCAL_DOCX``.

        Returns:
            Path of the generated DOCX file.

        Raises:
            FileProcessingError: If pandoc exits with an error or cannot be
                run at all. The original exception is attached as the cause.
        """
        output_dir = settings.LOCAL_DOCX
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / f"{base_name}_unificado.docx"

        self.logger.info(
            f"Converting Markdown to DOCX: {markdown_path} -> {output_path}"
        )

        cmd = [
            "pandoc",
            str(markdown_path),
            "-o",
            str(output_path),
            "--from=markdown",
            "--to=docx",
        ]

        try:
            # check=True turns a non-zero pandoc exit into CalledProcessError;
            # output is captured so stderr can be reported.
            subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Pandoc DOCX conversion failed: {e.stderr}")
            raise FileProcessingError(
                f"Failed to generate DOCX: {e.stderr}"
            ) from e
        except Exception as e:
            # e.g. pandoc is not installed or not on PATH
            self.logger.error(f"Error generating DOCX: {e}")
            raise FileProcessingError(f"Error generating DOCX: {e}") from e

        self.logger.info("DOCX generated successfully with pandoc")
        return output_path

    def _create_pdf(self, markdown_path: Path, base_name: str) -> Path:
        """Convert the Markdown file to PDF with pandoc and the pdflatex engine.

        Args:
            markdown_path: Markdown file to convert.
            base_name: Stem of the output file, written as
                ``<base_name>_unificado.pdf`` in
                ``settings.LOCAL_DOWNLOADS_PATH``.

        Returns:
            Path of the generated PDF file.

        Raises:
            FileProcessingError: If pandoc exits with an error or cannot be
                run at all. The original exception is attached as the cause.
        """
        output_dir = settings.LOCAL_DOWNLOADS_PATH
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / f"{base_name}_unificado.pdf"

        self.logger.info(
            f"Converting Markdown to PDF: {markdown_path} -> {output_path}"
        )

        cmd = [
            "pandoc",
            str(markdown_path),
            "-o",
            str(output_path),
            "--pdf-engine=pdflatex",
            "-V",
            "geometry:margin=2.5cm",
            "-V",
            "fontsize=12pt",
            "--highlight-style=tango",
        ]

        try:
            # check=True turns a non-zero pandoc exit into CalledProcessError;
            # output is captured so stderr can be reported.
            subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Pandoc PDF conversion failed: {e.stderr}")
            raise FileProcessingError(
                f"Failed to generate PDF: {e.stderr}"
            ) from e
        except Exception as e:
            # e.g. pandoc or pdflatex is not installed or not on PATH
            self.logger.error(f"Error generating PDF: {e}")
            raise FileProcessingError(f"Error generating PDF: {e}") from e

        self.logger.info("PDF generated successfully with pandoc")
        return output_path