Files
cbcren2026/document/generators.py
renato97 915f827305 feat: Implementación de Resúmenes Matemáticos con LaTeX y Pandoc
## Novedades
- **Soporte LaTeX**: Generación de PDFs y DOCX con fórmulas matemáticas renderizadas correctamente usando Pandoc.
- **Sanitización Automática**: Corrección de caracteres Unicode (griegos/cirílicos) y sintaxis LaTeX para evitar errores de compilación.
- **GLM/Claude Prioritario**: Cambio de proveedor de IA predeterminado a Claude/GLM para mayor estabilidad y capacidad de razonamiento.
- **Mejoras en Formato**: El formateo final del resumen ahora usa el modelo principal (GLM) en lugar de Gemini para consistencia.

## 🛠️ Cambios Técnicos
- `document/generators.py`: Reemplazo de generación manual por `pandoc`. Añadida función `_sanitize_latex`.
- `services/ai/claude_provider.py`: Soporte mejorado para variables de entorno de Z.ai.
- `services/ai/provider_factory.py`: Prioridad ajustada `Claude > Gemini`.
- `latex/`: Añadida documentación de referencia para el pipeline LaTeX.
2026-01-26 23:40:16 +00:00

353 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Document generation utilities
"""
import logging
import subprocess
import re
from pathlib import Path
from typing import Dict, Any, List, Tuple
from core import FileProcessingError
from config import settings
from services.ai import ai_provider_factory
class DocumentGenerator:
    """Generate summary documents (Markdown, DOCX, PDF) from processed text.

    The AI provider returned by ``ai_provider_factory.get_best_provider()``
    writes the summary; Pandoc converts the resulting Markdown to DOCX and
    PDF.
    """

    # Characters LLMs tend to emit that pdflatex cannot typeset: Cyrillic
    # look-alikes and typographic punctuation. Written as escapes so the
    # table cannot be silently corrupted by copy/paste or extraction.
    # NOTE(review): the five punctuation code points are reconstructed from
    # their ASCII replacements - confirm against the intended list.
    _TEXT_REPLACEMENTS: Dict[str, str] = {
        "\u0447": "ch",  # CYRILLIC SMALL LETTER CHE
        "\u0432": "v",  # CYRILLIC SMALL LETTER VE
        "\u043a": "k",  # CYRILLIC SMALL LETTER KA
        "\u043c": "m",  # CYRILLIC SMALL LETTER EM
        "\u043d": "n",  # CYRILLIC SMALL LETTER EN
        "\u0442": "t",  # CYRILLIC SMALL LETTER TE
        "\u2013": "-",  # EN DASH
        "\u2014": "-",  # EM DASH
        "\u201c": '"',  # LEFT DOUBLE QUOTATION MARK
        "\u201d": '"',  # RIGHT DOUBLE QUOTATION MARK
        "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK
    }

    # Raw Unicode Greek letters and the LaTeX command that replaces each.
    _GREEK_TO_LATEX: Dict[str, str] = {
        "\u0394": "\\Delta",
        "\u03b4": "\\delta",
        "\u03a3": "\\Sigma",
        "\u03c3": "\\sigma",
        "\u03c0": "\\pi",
        "\u03a0": "\\Pi",
        "\u03b1": "\\alpha",
        "\u03b2": "\\beta",
        "\u03b3": "\\gamma",
        "\u03b8": "\\theta",
        "\u03bb": "\\lambda",
        "\u03bc": "\\mu",
    }

    _TEXT_TABLE = str.maketrans(_TEXT_REPLACEMENTS)
    _GREEK_RUN = re.compile("[" + "".join(_GREEK_TO_LATEX) + "]+")
    # Display math ($$...$$) or inline math ($...$). The inline branch follows
    # Pandoc's rule: no space right after the opening "$", none right before
    # the closing "$", and no digit right after it (so "$5 and $10" is text).
    _MATH_SPAN = re.compile(
        r"(\$\$.+?\$\$|\$(?=\S)[^$\n]+?(?<=\S)\$(?!\d))", re.DOTALL
    )

    def __init__(self):
        """Set up the module logger and select the AI provider."""
        self.logger = logging.getLogger(__name__)
        self.ai_provider = ai_provider_factory.get_best_provider()

    def generate_summary(
        self, text: str, base_name: str
    ) -> Tuple[bool, str, Dict[str, Any]]:
        """Generate a unified summary and export it to Markdown, DOCX and PDF.

        Steps: extract bullet points, write the summary, reformat it, derive
        a filename, write the documents and, when Notion is configured,
        upload the summary.

        Args:
            text: Source text to summarize (truncated inside the prompts).
            base_name: Stem used for the output file names and the title.

        Returns:
            ``(True, summary, metadata)`` on success, where ``metadata`` holds
            the generated paths, the suggested filename and the Notion upload
            status; ``(False, "", {})`` if any non-recoverable step fails.
            DOCX, PDF and Notion failures are logged and do not abort.
        """
        self.logger.info(f"Generating summary for {base_name}")
        try:
            # Step 1: bullet points (chunking is left to the provider; a
            # single prompt is used for now). The main provider generates
            # the content.
            self.logger.info("Generating bullet points...")
            # The text is truncated to keep the prompt within context limits.
            bullet_prompt = f"""Analiza el siguiente texto y extrae entre 5 y 8 bullet points clave en español.
REGLAS ESTRICTAS:
1. Devuelve ÚNICAMENTE bullet points, cada línea iniciando con "- "
2. Cada bullet debe ser conciso (12-20 palabras) y resaltar datos, fechas, conceptos o conclusiones importantes
3. NO agregues introducciones, conclusiones ni texto explicativo
4. Concéntrate en los puntos más importantes del texto
5. Incluye fechas, datos específicos y nombres relevantes si los hay
Texto:
{text[:15000]}"""
            try:
                bullet_points = self.ai_provider.generate_text(bullet_prompt)
                self.logger.info(f"Bullet points generated: {len(bullet_points)}")
            except Exception as e:
                # Bullets are optional input for step 2: fall back, go on.
                self.logger.warning(f"Bullet point generation failed: {e}")
                bullet_points = "- Puntos clave no disponibles por error en IA"

            # Step 2: unified summary.
            self.logger.info("Generating unified summary...")
            summary_prompt = f"""Eres un profesor universitario experto en historia y economía. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos.
REQUISITOS ESTRICTOS DE CONTENIDO:
- Extensión entre 500-700 palabras
- Usa encabezados Markdown con jerarquía clara (##, ###)
- Desarrolla los puntos clave con profundidad y contexto histórico/económico
- Mantén un tono académico y analítico
- Incluye conclusiones significativas
- NO agregues texto fuera del resumen
- Devuelve únicamente el resumen en formato Markdown
REQUISITOS ESTRICTOS DE FORMATO MATEMÁTICO (LaTeX):
- Si el texto incluye fórmulas matemáticas o económicas, DEBES usar formato LaTeX.
- Usa bloques $$ ... $$ para ecuaciones centradas importantes.
- Usa $ ... $ para ecuaciones en línea.
- Ejemplo: La fórmula del interés compuesto es $A = P(1 + r/n)^{{nt}}$.
- NO uses bloques de código (```latex) para las fórmulas, úsalas directamente en el texto para que Pandoc las renderice.
Contenido a resumir:
{text[:20000]}
Puntos clave a incluir obligatoriamente:
{bullet_points}"""
            try:
                raw_summary = self.ai_provider.generate_text(summary_prompt)
            except Exception as e:
                self.logger.error(f"Raw summary generation failed: {e}")
                # Bare raise keeps the original traceback; the outer handler
                # turns this into the (False, "", {}) result.
                raise

            # Step 3: formatting pass, done by the main provider as well.
            self.logger.info("Formatting summary with IA...")
            format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible y compatible con Pandoc:
{raw_summary}
Instrucciones:
- Corrige cualquier error de formato Markdown
- Asegúrate de que los encabezados estén bien espaciados
- Verifica que las viñetas usen "- " correctamente
- Mantén exactamente el contenido existente
- EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave
- VERIFICA que todas las fórmulas matemáticas estén correctamente encerradas en $...$ (inline) o $$...$$ (display)
- NO alteres la sintaxis LaTeX dentro de los delimitadores $...$ o $$...$$
- Devuelve únicamente el resumen formateado sin texto adicional"""
            try:
                if self.ai_provider.is_available():
                    summary = self.ai_provider.generate_text(format_prompt)
                else:
                    self.logger.warning(
                        "AI provider not available for formatting, using raw summary"
                    )
                    summary = raw_summary
            except Exception as e:
                # Formatting is cosmetic; the raw summary is still usable.
                self.logger.warning(f"Formatting failed ({e}), using raw summary")
                summary = raw_summary

            filename = self._generate_filename(text, summary)
            markdown_path = self._create_markdown(summary, base_name)

            docx_path = None
            try:
                docx_path = self._create_docx(markdown_path, base_name)
            except Exception as e:
                self.logger.error(f"Failed to create DOCX (non-critical): {e}")

            pdf_path = None
            try:
                # pdflatex is strict: sanitize the Markdown before converting.
                self._sanitize_latex(markdown_path)
                pdf_path = self._create_pdf(markdown_path, base_name)
            except Exception as e:
                self.logger.error(f"Failed to create PDF (non-critical): {e}")

            # Upload to Notion if configured. The import is kept local, as in
            # the original code (presumably to avoid an import cycle or a
            # hard dependency - TODO confirm).
            from services.notion_service import notion_service

            notion_uploaded = False
            notion_page_id = None
            if settings.has_notion_config:
                try:
                    title = base_name.replace("_", " ").title()
                    # Create the page with the full summary content.
                    notion_metadata = {
                        "file_type": "Audio",  # or 'PDF', depending on the source
                        "pdf_path": pdf_path or Path(""),
                        "add_status": False,  # Status/Type do not exist in the DB
                        "use_as_page": False,  # use as a database, not a page
                    }
                    notion_page_id = notion_service.create_page_with_summary(
                        title=title, summary=summary, metadata=notion_metadata
                    )
                    if notion_page_id:
                        notion_uploaded = True
                        self.logger.info(
                            f"✅ Resumen subido a Notion: {title} (ID: {notion_page_id})"
                        )
                    else:
                        self.logger.warning(f"⚠️ No se pudo subir a Notion: {title}")
                except Exception as e:
                    # exc_info routes the traceback through the logger
                    # instead of printing it straight to stderr.
                    self.logger.warning(
                        f"❌ Error al subir a Notion: {e}", exc_info=True
                    )
            else:
                self.logger.info("Notion not configured - skipping upload")

            metadata = {
                "markdown_path": str(markdown_path),
                "docx_path": str(docx_path) if docx_path else "",
                "pdf_path": str(pdf_path) if pdf_path else "",
                "docx_name": Path(docx_path).name if docx_path else "",
                "summary": summary,
                "filename": filename,
                "notion_uploaded": notion_uploaded,
                "notion_page_id": notion_page_id,
            }
            return True, summary, metadata
        except Exception as e:
            self.logger.error(f"Document generation process failed: {e}")
            return False, "", {}

    def _greek_to_latex(self, text: str, in_math: bool) -> str:
        """Replace runs of raw Greek letters in ``text`` with LaTeX commands.

        Inside math the run is wrapped in braces (``{\\Delta}``) so that a
        following letter cannot fuse with the command name; outside math it
        is wrapped in ``$...$`` so Pandoc renders it as inline math.
        """

        def render(match):
            latex = "".join(self._GREEK_TO_LATEX[ch] for ch in match.group())
            return "{" + latex + "}" if in_math else "$" + latex + "$"

        return self._GREEK_RUN.sub(render, text)

    def _sanitize_latex(self, markdown_path: Path) -> None:
        """Rewrite the Markdown file in place so Pandoc/pdflatex accept it.

        Best effort: any failure is logged as a warning and the file is left
        as it was, so PDF generation can still be attempted.

        Args:
            markdown_path: UTF-8 Markdown file to sanitize.
        """
        try:
            content = markdown_path.read_text(encoding="utf-8")
            # 1. LLMs often escape the dollars of math delimiters; undo that.
            #    This also unescapes deliberate literal "\$" (e.g. currency).
            content = content.replace(r"\$", "$")
            # 2. Cyrillic look-alikes and typographic punctuation -> ASCII.
            content = content.translate(self._TEXT_TABLE)
            # 3. Greek letters -> LaTeX. Splitting on a single capture group
            #    yields text at even indices and math spans at odd indices,
            #    so letters already inside math are not wrapped in a second
            #    pair of dollars.
            segments = self._MATH_SPAN.split(content)
            content = "".join(
                self._greek_to_latex(segment, in_math=index % 2 == 1)
                for index, segment in enumerate(segments)
            )
            markdown_path.write_text(content, encoding="utf-8")
            self.logger.info(f"Sanitized LaTeX in {markdown_path}")
        except Exception as e:
            self.logger.warning(f"Failed to sanitize LaTeX: {e}")

    def _generate_filename(self, text: str, summary: str) -> str:
        """Suggest a short filename built from the summary's key topics.

        Args:
            text: Original text (currently unused; kept for the interface).
            summary: Summary the topics are extracted from.

        Returns:
            Up to three capitalized words joined by ``_`` and truncated per
            ``settings``; ``"documento"`` when nothing usable is found or an
            error occurs.
        """
        try:
            # Ask the AI provider for the key topics.
            prompt = f"""Extract 2-3 key topics from this summary to create a filename.
Summary: {summary}
Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
            try:
                topics_text = self.ai_provider.generate_text(prompt)
            except Exception:
                # No AI available: mine the start of the summary instead.
                topics_text = summary[:100]
            # Keep only capitalized words (Spanish accents included); this
            # also guarantees the result contains letters only.
            topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3]
            if not topics:
                topics = ["documento"]
            topics = [t[: settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]
            return "_".join(topics)[: settings.MAX_FILENAME_LENGTH]
        except Exception as e:
            self.logger.error(f"Filename generation failed: {e}")
            return "documento"

    def _create_markdown(self, summary: str, base_name: str) -> Path:
        """Write the summary to ``<base_name>_unificado.md`` and return its path.

        The file is created under ``settings.LOCAL_DOWNLOADS_PATH``.
        """
        output_dir = settings.LOCAL_DOWNLOADS_PATH
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / f"{base_name}_unificado.md"
        # Blank lines matter to Pandoc: a "---" directly under a text line is
        # a setext heading underline, not a horizontal rule, and headings
        # need a blank line before them.
        content = f"""# {base_name.replace("_", " ").title()}

## Resumen

{summary}

---

*Generado por CBCFacil*
"""
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)
        return output_path

    def _run_pandoc(self, cmd: List[str], label: str) -> None:
        """Run a Pandoc command, mapping any failure to FileProcessingError.

        Args:
            cmd: Full argument vector (run without a shell).
            label: Output format name used in log and error messages.

        Raises:
            FileProcessingError: Pandoc exited non-zero or could not be run.
        """
        try:
            subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Pandoc {label} conversion failed: {e.stderr}")
            raise FileProcessingError(f"Failed to generate {label}: {e.stderr}") from e
        except Exception as e:
            # E.g. the pandoc executable is missing.
            self.logger.error(f"Error generating {label}: {e}")
            raise FileProcessingError(f"Error generating {label}: {e}") from e
        self.logger.info(f"{label} generated successfully with pandoc")

    def _create_docx(self, markdown_path: Path, base_name: str) -> Path:
        """Convert the Markdown file to DOCX with Pandoc.

        Returns:
            Path of ``<base_name>_unificado.docx`` under ``settings.LOCAL_DOCX``.

        Raises:
            FileProcessingError: The conversion failed.
        """
        output_dir = settings.LOCAL_DOCX
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / f"{base_name}_unificado.docx"
        self.logger.info(
            f"Converting Markdown to DOCX: {markdown_path} -> {output_path}"
        )
        self._run_pandoc(
            [
                "pandoc",
                str(markdown_path),
                "-o",
                str(output_path),
                "--from=markdown",
                "--to=docx",
            ],
            "DOCX",
        )
        return output_path

    def _create_pdf(self, markdown_path: Path, base_name: str) -> Path:
        """Convert the Markdown file to PDF with Pandoc and pdflatex.

        Returns:
            Path of ``<base_name>_unificado.pdf`` under
            ``settings.LOCAL_DOWNLOADS_PATH``.

        Raises:
            FileProcessingError: The conversion failed.
        """
        output_dir = settings.LOCAL_DOWNLOADS_PATH
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / f"{base_name}_unificado.pdf"
        self.logger.info(
            f"Converting Markdown to PDF: {markdown_path} -> {output_path}"
        )
        self._run_pandoc(
            [
                "pandoc",
                str(markdown_path),
                "-o",
                str(output_path),
                "--pdf-engine=pdflatex",
                "-V",
                "geometry:margin=2.5cm",
                "-V",
                "fontsize=12pt",
                "--highlight-style=tango",
            ],
            "PDF",
        )
        return output_path