feat: Implementación de Resúmenes Matemáticos con LaTeX y Pandoc

## ✨ Novedades
- **Soporte LaTeX**: Generación de PDFs y DOCX con fórmulas matemáticas renderizadas correctamente usando Pandoc.
- **Sanitización Automática**: Corrección de caracteres Unicode (griegos/cirílicos) y sintaxis LaTeX para evitar errores de compilación.
- **GLM/Claude Prioritario**: Cambio de proveedor de IA predeterminado a Claude/GLM para mayor estabilidad y capacidad de razonamiento.
- **Mejoras en Formato**: El formateo final del resumen ahora usa el modelo principal (GLM) en lugar de Gemini para consistencia.

## 🛠️ Cambios Técnicos
- `document/generators.py`: Reemplazo de generación manual por `pandoc`. Añadida función `_sanitize_latex`.
- `services/ai/claude_provider.py`: Soporte mejorado para variables de entorno de Z.ai.
- `services/ai/provider_factory.py`: Prioridad ajustada `Claude > Gemini`.
- `latex/`: Añadida documentación de referencia para el pipeline LaTeX.
This commit is contained in:
renato97
2026-01-26 23:40:16 +00:00
parent f9d245a58e
commit 915f827305
4 changed files with 384 additions and 178 deletions

View File

@@ -3,6 +3,7 @@ Document generation utilities
"""
import logging
import subprocess
import re
from pathlib import Path
from typing import Dict, Any, List, Tuple
@@ -49,17 +50,24 @@ Texto:
# Step 2: Generate Unified Summary
self.logger.info("Generating unified summary...")
summary_prompt = f"""Eres un profesor universitario experto en historia del siglo XX. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos.
summary_prompt = f"""Eres un profesor universitario experto en historia y economía. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos.
REQUISITOS ESTRICTOS:
REQUISITOS ESTRICTOS DE CONTENIDO:
- Extensión entre 500-700 palabras
- Usa encabezados Markdown con jerarquía clara (##, ###)
- Desarrolla los puntos clave con profundidad y contexto histórico
- Desarrolla los puntos clave con profundidad y contexto histórico/económico
- Mantén un tono académico y analítico
- Incluye conclusiones significativas
- NO agregues texto fuera del resumen
- Devuelve únicamente el resumen en formato Markdown
REQUISITOS ESTRICTOS DE FORMATO MATEMÁTICO (LaTeX):
- Si el texto incluye fórmulas matemáticas o económicas, DEBES usar formato LaTeX.
- Usa bloques $$ ... $$ para ecuaciones centradas importantes.
- Usa $ ... $ para ecuaciones en línea.
- Ejemplo: La fórmula del interés compuesto es $A = P(1 + r/n)^{{nt}}$.
- NO uses bloques de código (```latex) para las fórmulas, úsalas directamente en el texto para que Pandoc las renderice.
Contenido a resumir:
{text[:20000]}
@@ -72,31 +80,29 @@ Puntos clave a incluir obligatoriamente:
self.logger.error(f"Raw summary generation failed: {e}")
raise e
# Step 3: Format with Gemini (using GeminiProvider explicitly)
self.logger.info("Formatting summary with Gemini...")
format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible:
# Step 3: Format with IA (using main provider instead of Gemini)
self.logger.info("Formatting summary with IA...")
format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible y compatible con Pandoc:
{raw_summary}
Instrucciones:
- Corrige cualquier error de formato
- Corrige cualquier error de formato Markdown
- Asegúrate de que los encabezados estén bien espaciados
- Verifica que las viñetas usen "- " correctamente
- Mantén exactamente el contenido existente
- EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave
- VERIFICA que todas las fórmulas matemáticas estén correctamente encerradas en $...$ (inline) o $$...$$ (display)
- NO alteres la sintaxis LaTeX dentro de los delimitadores $...$ o $$...$$
- Devuelve únicamente el resumen formateado sin texto adicional"""
# Use generic Gemini provider for formatting as requested
from services.ai.gemini_provider import GeminiProvider
formatter = GeminiProvider()
try:
if formatter.is_available():
summary = formatter.generate_text(format_prompt)
# Use the main provider (Claude/GLM) for formatting too
if self.ai_provider.is_available():
summary = self.ai_provider.generate_text(format_prompt)
else:
self.logger.warning(
"Gemini formatter not available, using raw summary"
"AI provider not available for formatting, using raw summary"
)
summary = raw_summary
except Exception as e:
@@ -108,8 +114,20 @@ Instrucciones:
# Create document
markdown_path = self._create_markdown(summary, base_name)
docx_path = self._create_docx(summary, base_name)
pdf_path = self._create_pdf(summary, base_name)
docx_path = None
try:
docx_path = self._create_docx(markdown_path, base_name)
except Exception as e:
self.logger.error(f"Failed to create DOCX (non-critical): {e}")
pdf_path = None
try:
# Sanitize LaTeX before PDF generation
self._sanitize_latex(markdown_path)
pdf_path = self._create_pdf(markdown_path, base_name)
except Exception as e:
self.logger.error(f"Failed to create PDF (non-critical): {e}")
# Upload to Notion if configured
from services.notion_service import notion_service
@@ -123,7 +141,7 @@ Instrucciones:
# Crear página con el contenido completo del resumen
notion_metadata = {
"file_type": "Audio", # O 'PDF' dependiendo del origen
"pdf_path": pdf_path,
"pdf_path": pdf_path if pdf_path else Path(""),
"add_status": False, # No usar Status/Tipo (no existen en la DB)
"use_as_page": False, # Usar como database, no página
}
@@ -149,9 +167,9 @@ Instrucciones:
metadata = {
"markdown_path": str(markdown_path),
"docx_path": str(docx_path),
"pdf_path": str(pdf_path),
"docx_name": Path(docx_path).name,
"docx_path": str(docx_path) if docx_path else "",
"pdf_path": str(pdf_path) if pdf_path else "",
"docx_name": Path(docx_path).name if docx_path else "",
"summary": summary,
"filename": filename,
"notion_uploaded": notion_uploaded,
@@ -164,6 +182,53 @@ Instrucciones:
self.logger.error(f"Document generation process failed: {e}")
return False, "", {}
def _sanitize_latex(self, markdown_path: Path) -> None:
"""Sanitize LaTeX syntax in Markdown file to prevent Pandoc errors"""
try:
content = markdown_path.read_text(encoding="utf-8")
# 1. Unescape escaped dollar signs which are common LLM errors for math
content = content.replace(r"\$", "$")
# 2. Fix common Cyrillic and Greek characters that sneak in via LLMs
replacements = {
"ч": "ch",
"в": "v",
"к": "k",
"м": "m",
"н": "n",
"т": "t",
"": "-",
"": "-",
"": '"',
"": '"',
"": "'",
"Δ": "$\\Delta$",
"δ": "$\\delta$",
"Σ": "$\\Sigma$",
"σ": "$\\sigma$",
"π": "$\\pi$",
"Π": "$\\Pi$",
"α": "$\\alpha$",
"β": "$\\beta$",
"γ": "$\\gamma$",
"θ": "$\\theta$",
"λ": "$\\lambda$",
"μ": "$\\mu$",
}
# Be careful not to double-replace already correct LaTeX
for char, repl in replacements.items():
if char in content:
# Check if it's already inside math mode would be complex,
# but for now we assume raw unicode greek chars should become latex
content = content.replace(char, repl)
markdown_path.write_text(content, encoding="utf-8")
self.logger.info(f"Sanitized LaTeX in {markdown_path}")
except Exception as e:
self.logger.warning(f"Failed to sanitize LaTeX: {e}")
def _generate_filename(self, text: str, summary: str) -> str:
"""Generate intelligent filename"""
try:
@@ -173,11 +238,10 @@ Summary: {summary}
Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
topics_text = (
self.ai_provider.sanitize_input(prompt)
if hasattr(self.ai_provider, "sanitize_input")
else summary[:100]
)
try:
topics_text = self.ai_provider.generate_text(prompt)
except Exception:
topics_text = summary[:100]
# Simple topic extraction
topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3]
@@ -192,7 +256,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
except Exception as e:
self.logger.error(f"Filename generation failed: {e}")
return base_name[: settings.MAX_FILENAME_BASE_LENGTH]
return "documento"
def _create_markdown(self, summary: str, base_name: str) -> Path:
"""Create Markdown document"""
@@ -217,154 +281,72 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
return output_path
def _create_docx(self, summary: str, base_name: str) -> Path:
"""Create DOCX document with Markdown parsing (Legacy method ported)"""
try:
from docx import Document
from docx.shared import Inches
except ImportError:
raise FileProcessingError("python-docx not installed")
def _create_docx(self, markdown_path: Path, base_name: str) -> Path:
"""Create DOCX document using pandoc"""
output_dir = settings.LOCAL_DOCX
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{base_name}_unificado.docx"
doc = Document()
doc.add_heading(base_name.replace("_", " ").title(), 0)
self.logger.info(
f"Converting Markdown to DOCX: {markdown_path} -> {output_path}"
)
# Parse and render Markdown content line by line
lines = summary.splitlines()
current_paragraph = []
for line in lines:
line = line.strip()
if not line:
if current_paragraph:
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3 # JUSTIFY alignment (WD_ALIGN_PARAGRAPH.JUSTIFY=3)
current_paragraph = []
continue
if line.startswith("#"):
if current_paragraph:
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
current_paragraph = []
# Process heading
level = len(line) - len(line.lstrip("#"))
heading_text = line.lstrip("#").strip()
if level <= 6:
doc.add_heading(heading_text, level=level)
else:
current_paragraph.append(heading_text)
elif line.startswith("-") or line.startswith("*") or line.startswith(""):
if current_paragraph:
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
current_paragraph = []
bullet_text = line.lstrip("-*• ").strip()
p = doc.add_paragraph(bullet_text, style="List Bullet")
# Remove bold markers from bullets if present
if "**" in bullet_text:
# Basic cleanup for bullets
pass
else:
# Clean up excessive bold markers in body text if user requested
clean_line = line.replace(
"**", ""
) # Removing asterisks as per user complaint "se abusa de los asteriscos"
current_paragraph.append(clean_line)
if current_paragraph:
p = doc.add_paragraph(" ".join(current_paragraph))
p.alignment = 3
doc.add_page_break()
doc.add_paragraph(f"*Generado por CBCFacil*")
doc.save(output_path)
return output_path
def _create_pdf(self, summary: str, base_name: str) -> Path:
"""Create PDF document with Markdown parsing (Legacy method ported)"""
try:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import textwrap
except ImportError:
raise FileProcessingError("reportlab not installed")
cmd = [
"pandoc",
str(markdown_path),
"-o",
str(output_path),
"--from=markdown",
"--to=docx",
]
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
self.logger.info("DOCX generated successfully with pandoc")
return output_path
except subprocess.CalledProcessError as e:
self.logger.error(f"Pandoc DOCX conversion failed: {e.stderr}")
raise FileProcessingError(f"Failed to generate DOCX: {e.stderr}")
except Exception as e:
self.logger.error(f"Error generating DOCX: {e}")
raise FileProcessingError(f"Error generating DOCX: {e}")
def _create_pdf(self, markdown_path: Path, base_name: str) -> Path:
"""Create PDF document using pandoc and pdflatex"""
output_dir = settings.LOCAL_DOWNLOADS_PATH
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{base_name}_unificado.pdf"
c = canvas.Canvas(str(output_path), pagesize=letter)
width, height = letter
margin = 72
y_position = height - margin
self.logger.info(
f"Converting Markdown to PDF: {markdown_path} -> {output_path}"
)
def new_page():
nonlocal y_position
c.showPage()
c.setFont("Helvetica", 11)
y_position = height - margin
try:
cmd = [
"pandoc",
str(markdown_path),
"-o",
str(output_path),
"--pdf-engine=pdflatex",
"-V",
"geometry:margin=2.5cm",
"-V",
"fontsize=12pt",
"--highlight-style=tango",
]
c.setFont("Helvetica", 11)
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
# Title
c.setFont("Helvetica-Bold", 16)
c.drawString(margin, y_position, base_name.replace("_", " ").title()[:100])
y_position -= 28
c.setFont("Helvetica", 11)
self.logger.info("PDF generated successfully with pandoc")
return output_path
summary_clean = summary.replace(
"**", ""
) # Remove asterisks globally for cleaner PDF
for raw_line in summary_clean.splitlines():
line = raw_line.rstrip()
if not line.strip():
y_position -= 14
if y_position < margin:
new_page()
continue
stripped = line.lstrip()
if stripped.startswith("#"):
level = len(stripped) - len(stripped.lstrip("#"))
heading_text = stripped.lstrip("#").strip()
if heading_text:
font_size = 16 if level == 1 else 14 if level == 2 else 12
c.setFont("Helvetica-Bold", font_size)
c.drawString(margin, y_position, heading_text[:90])
y_position -= font_size + 6
if y_position < margin:
new_page()
c.setFont("Helvetica", 11)
continue
if stripped.startswith(("-", "*", "")):
bullet_text = stripped.lstrip("-*•").strip()
wrapped_lines = textwrap.wrap(bullet_text, width=80) or [""]
for idx, wrapped in enumerate(wrapped_lines):
prefix = "" if idx == 0 else " "
c.drawString(margin, y_position, f"{prefix}{wrapped}")
y_position -= 14
if y_position < margin:
new_page()
continue
# Body text - Justified approximation (ReportLab native justification requires Paragraph styles, defaulting to wrap)
wrapped_lines = textwrap.wrap(stripped, width=90) or [""]
for wrapped in wrapped_lines:
c.drawString(margin, y_position, wrapped)
y_position -= 14
if y_position < margin:
new_page()
c.save()
return output_path
except subprocess.CalledProcessError as e:
self.logger.error(f"Pandoc PDF conversion failed: {e.stderr}")
raise FileProcessingError(f"Failed to generate PDF: {e.stderr}")
except Exception as e:
self.logger.error(f"Error generating PDF: {e}")
raise FileProcessingError(f"Error generating PDF: {e}")

View File

@@ -0,0 +1,207 @@
# Pipeline de Generación de Resúmenes Matemáticos (LaTeX -> PDF)
Este documento contiene un script genérico en Python diseñado para integrarse en pipelines de automatización (GitHub Actions, Jenkins, GitLab CI). El script toma un archivo de texto plano, genera un resumen académico con fórmulas matemáticas usando LLMs (MiniMax, GLM, Gemini) y lo compila a PDF preservando la notación LaTeX.
## 1. Requisitos del Sistema
El entorno donde se ejecute este script debe tener instalado:
- **Python 3.8+**
- **Pandoc** (para conversión de documentos)
- **PDFLaTeX** (generalmente parte de TeX Live, para renderizar fórmulas)
### Instalación en Debian/Ubuntu (Docker o CI)
```bash
apt-get update && apt-get install -y pandoc texlive-latex-base texlive-fonts-recommended python3-pip
pip install requests
```
## 2. Script Genérico (`math_summary.py`)
Guarda el siguiente código como `math_summary.py`. Este script es agnóstico al proveedor y se configura mediante argumentos o variables de entorno.
```python
#!/usr/bin/env python3
import os
import sys
import argparse
import subprocess
import requests
import json
# Model configuration.
# Provider registry: each entry describes one Anthropic-compatible
# "messages" endpoint. Fields:
#   url            -- messages API endpoint to POST to
#   model          -- model identifier sent in the request payload
#   header_key     -- HTTP header name used to pass the API key
#   version_header -- extra header(s) required by the Anthropic API dialect
#   env_var        -- environment variable that holds the API key
PROVIDERS = {
    "minimax": {
        "url": "https://api.minimax.io/anthropic/v1/messages",
        "model": "MiniMax-M2",
        "header_key": "x-api-key",
        "version_header": {"anthropic-version": "2023-06-01"},
        "env_var": "MINIMAX_API_KEY"
    },
    "glm": {
        "url": "https://api.z.ai/api/anthropic/v1/messages",
        "model": "glm-4.7",
        "header_key": "x-api-key",
        "version_header": {"anthropic-version": "2023-06-01"},
        "env_var": "GLM_API_KEY"
    }
}
# System prompt prepended to the user text. It forces Markdown output with
# $...$ / $$...$$ LaTeX delimiters so Pandoc can hand formulas to pdflatex
# (fenced ```latex blocks would render as literal code, not math).
PROMPT_SYSTEM = """
Eres un asistente académico experto en matemáticas y economía.
Tu tarea es resumir el texto proporcionado manteniendo el rigor científico.
REGLAS DE FORMATO (CRÍTICO):
1. La salida debe ser Markdown válido.
2. TODAS las fórmulas matemáticas deben estar en formato LaTeX.
3. Usa bloques $$ ... $$ para ecuaciones centradas importantes.
4. Usa $ ... $ para ecuaciones en línea.
5. NO uses bloques de código (```latex) para las fórmulas, úsalas directamente en el texto para que Pandoc las renderice.
6. Incluye una sección de 'Conceptos Matemáticos' con las fórmulas desglosadas.
"""
def get_api_key(provider):
    """Return the API key for *provider* from its environment variable.

    Exits the process with status 1 (after printing a Spanish error message,
    matching the CLI's output language) when the variable is unset or empty.
    """
    var_name = PROVIDERS[provider]["env_var"]
    value = os.getenv(var_name)
    if value:
        return value
    print(f"Error: La variable de entorno {var_name} no está definida.")
    sys.exit(1)
def call_llm(provider, text, api_key):
    """Send *text* to the configured provider and return the summary text.

    Builds an Anthropic-style "messages" request, then concatenates only the
    text parts of the response (MiniMax may interleave non-text "thinking"
    blocks). Falls back to the first content element for providers that omit
    the ``type`` field (standard GLM). Returns ``None`` on any failure.
    """
    print(f"--- Contactando API: {provider.upper()} ---")
    cfg = PROVIDERS[provider]

    headers = {
        "Content-Type": "application/json",
        cfg["header_key"]: api_key,
    }
    headers.update(cfg.get("version_header", {}))

    body = {
        "model": cfg["model"],
        "max_tokens": 4096,
        "messages": [
            {"role": "user", "content": f"{PROMPT_SYSTEM}\n\nTEXTO A RESUMIR:\n{text}"}
        ],
    }

    try:
        response = requests.post(cfg["url"], json=body, headers=headers, timeout=120)
        response.raise_for_status()
        data = response.json()

        # Keep only explicit text blocks (skips MiniMax "thinking" blocks).
        pieces = [
            part.get("text", "")
            for part in data.get("content", [])
            if part.get("type") == "text"
        ]
        result = "".join(pieces)

        # Fallback when content entries carry no explicit type (GLM).
        if not result and data.get("content"):
            if isinstance(data["content"], list):
                result = data["content"][0].get("text", "")
        return result
    except Exception as e:
        print(f"Error llamando a {provider}: {e}")
        return None
def convert_to_pdf(markdown_content, output_file):
    """Write *markdown_content* to a sibling ``.md`` file and compile a PDF.

    The intermediate Markdown file is deliberately kept on disk (useful for
    debugging the pipeline). Returns True on success; on failure prints
    Pandoc's stderr and returns False.
    """
    stem = os.path.splitext(output_file)[0]
    md_path = f"{stem}.md"
    with open(md_path, "w", encoding="utf-8") as handle:
        handle.write(markdown_content)

    print(f"--- Generando PDF: {output_file} ---")
    pandoc_args = [
        "pandoc", md_path,
        "-o", output_file,
        "--pdf-engine=pdflatex",
        "-V", "geometry:margin=2.5cm",
        "-V", "fontsize=12pt",
        "--highlight-style=tango",
    ]
    proc = subprocess.run(pandoc_args, capture_output=True, text=True)

    if proc.returncode != 0:
        print("Error en Pandoc:")
        print(proc.stderr)
        return False
    print("Éxito: PDF generado correctamente.")
    return True
def main():
    """CLI entry point: parse args, summarize the input file, emit a PDF."""
    parser = argparse.ArgumentParser(description="Generador de Resúmenes Matemáticos PDF")
    parser.add_argument("input_file", help="Ruta al archivo de texto (.txt) fuente")
    parser.add_argument("--provider", choices=["minimax", "glm"], default="glm", help="Proveedor de IA a usar")
    parser.add_argument("--output", default="resumen_output.pdf", help="Nombre del archivo PDF de salida")
    args = parser.parse_args()

    # Fail fast on a missing source file before spending an API call.
    if not os.path.exists(args.input_file):
        print(f"Error: No se encuentra el archivo {args.input_file}")
        sys.exit(1)

    with open(args.input_file, "r", encoding="utf-8") as source:
        text_content = source.read()

    api_key = get_api_key(args.provider)
    summary_md = call_llm(args.provider, text_content, api_key)

    if not summary_md:
        print("Fallo en la generación del resumen.")
        sys.exit(1)
    convert_to_pdf(summary_md, args.output)

if __name__ == "__main__":
    main()
```
## 3. Ejemplo de Uso en Pipeline
### Ejecución Local
```bash
export GLM_API_KEY="tu_api_key_aqui"
python3 math_summary.py entrada.txt --provider glm --output reporte_final.pdf
```
### GitHub Actions (Ejemplo .yaml)
Este paso automatizaría la creación del PDF cada vez que se sube un .txt a la carpeta `docs/`.
```yaml
name: Generar PDF Matemático
on:
push:
paths:
- 'docs/*.txt'
jobs:
build-pdf:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Instalar dependencias
run: |
sudo apt-get update
sudo apt-get install -y pandoc texlive-latex-base texlive-fonts-recommended
pip install requests
- name: Generar Resumen
env:
GLM_API_KEY: ${{ secrets.GLM_API_KEY }}
run: |
python3 math_summary.py docs/archivo.txt --provider glm --output docs/resumen.pdf
- name: Subir Artefacto
uses: actions/upload-artifact@v3
with:
name: PDF-Resumen
path: docs/resumen.pdf
```

View File

@@ -1,6 +1,7 @@
"""
Claude AI Provider implementation
"""
import logging
import subprocess
import shutil
@@ -30,11 +31,25 @@ class ClaudeProvider(AIProvider):
def _get_env(self) -> Dict[str, str]:
"""Get environment variables for Claude"""
env = {
'ANTHROPIC_AUTH_TOKEN': self._token,
'ANTHROPIC_BASE_URL': self._base_url,
'PYTHONUNBUFFERED': '1'
}
# Load all user environment variables first
import os
env = os.environ.copy()
# Override with our specific settings if available
if self._token:
env["ANTHROPIC_AUTH_TOKEN"] = self._token
if self._base_url:
env["ANTHROPIC_BASE_URL"] = self._base_url
# Add critical flags
env["PYTHONUNBUFFERED"] = "1"
# Ensure model variables are picked up from env (already in os.environ)
# but if we had explicit settings for them, we'd set them here.
# Since we put them in .env and loaded via load_dotenv -> os.environ,
# simply copying os.environ is sufficient.
return env
def _run_cli(self, prompt: str, timeout: int = 300) -> str:
@@ -51,7 +66,7 @@ class ClaudeProvider(AIProvider):
text=True,
capture_output=True,
timeout=timeout,
shell=False
shell=False,
)
if process.returncode != 0:
@@ -84,7 +99,12 @@ Return only the corrected text, nothing else."""
def classify_content(self, text: str, **kwargs) -> Dict[str, Any]:
"""Classify content using Claude"""
categories = ["historia", "analisis_contable", "instituciones_gobierno", "otras_clases"]
categories = [
"historia",
"analisis_contable",
"instituciones_gobierno",
"otras_clases",
]
prompt = f"""Classify the following text into one of these categories:
- historia
@@ -101,11 +121,7 @@ Return only the category name, nothing else."""
if result not in categories:
result = "otras_clases"
return {
"category": result,
"confidence": 0.9,
"provider": self.name
}
return {"category": result, "confidence": 0.9, "provider": self.name}
def generate_text(self, prompt: str, **kwargs) -> str:
"""Generate text using Claude"""

View File

@@ -1,6 +1,7 @@
"""
AI Provider Factory (Factory Pattern)
"""
import logging
from typing import Dict, Type
@@ -16,11 +17,11 @@ class AIProviderFactory:
def __init__(self):
self.logger = logging.getLogger(__name__)
self._providers: Dict[str, AIProvider] = {
'claude': ClaudeProvider(),
'gemini': GeminiProvider()
"claude": ClaudeProvider(),
"gemini": GeminiProvider(),
}
def get_provider(self, preferred: str = 'gemini') -> AIProvider:
def get_provider(self, preferred: str = "gemini") -> AIProvider:
"""Get available provider with fallback"""
# Try preferred provider first
if preferred in self._providers:
@@ -46,8 +47,8 @@ class AIProviderFactory:
}
def get_best_provider(self) -> AIProvider:
"""Get the best available provider (Gemini > Claude)"""
return self.get_provider('gemini')
"""Get the best available provider (Claude > Gemini)"""
return self.get_provider("claude")
# Global instance