Files
cbc2027/document/generators.py
renato97 dcf887c510 feat: Sistema LaTeX mejorado con sanitización automática y corrección de TikZ
Cambios principales:

## Nuevos archivos
- services/ai/parallel_provider.py: Ejecución paralela de múltiples proveedores AI
- services/ai/prompt_manager.py: Gestión centralizada de prompts (resumen.md como fuente)
- latex/resumen.md: Template del prompt para resúmenes académicos LaTeX

## Mejoras en generación LaTeX (document/generators.py)
- Nueva función _sanitize_latex(): Corrige automáticamente errores comunes de AI
  - Agrega align=center a nodos TikZ con saltos de línea (\\)
  - Previene errores 'Not allowed in LR mode' antes de compilar
- Soporte para procesamiento paralelo de proveedores AI
- Conversión DOCX en paralelo con generación PDF
- Uploads a Notion en background (non-blocking)
- Callbacks de notificación para progreso en Telegram

## Mejoras en proveedores AI
- claude_provider.py: fix_latex() con instrucciones específicas para errores TikZ
- gemini_provider.py: fix_latex() mejorado + rate limiting + circuit breaker
- provider_factory.py: Soporte para parallel provider

## Otros cambios
- config/settings.py: Nuevas configuraciones para Gemini models
- services/webdav_service.py: Mejoras en manejo de conexión
- .gitignore: Ignora archivos LaTeX auxiliares (.aux, .toc, .out, .pdf)

## Archivos de ejemplo
- latex/imperio_romano.tex, latex/clase_revolucion_rusa_crisis_30.tex
- resumen_curiosidades.tex (corregido y compilado exitosamente)
2026-02-07 20:50:27 +00:00

670 lines
25 KiB
Python

"""
Document generation utilities - LaTeX Academic Summary System
This module generates comprehensive academic summaries in LaTeX format
following the specifications in latex/resumen.md (the SINGLE SOURCE OF TRUTH).
Parallel Processing: Uses multiple agents for accelerated summary generation:
- AI Provider Racing: Multiple AI providers generate in parallel
- Parallel Format Conversion: PDF + DOCX generated simultaneously
- Background Notion Uploads: Non-blocking uploads to Notion
"""
import logging
import subprocess
import shutil
import re
import threading
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
from core import FileProcessingError
from config import settings
from services.ai import ai_provider_factory
from services.ai.prompt_manager import prompt_manager
def _sanitize_latex(latex_code: str) -> str:
"""
Pre-process LaTeX code to fix common errors before compilation.
This function applies automated fixes for known issues that AI models
frequently generate, reducing the need for fix_latex() iterations.
Currently handles:
- TikZ nodes with line breaks (\\\\) missing align=center
- Unbalanced environments (best effort)
"""
if not latex_code:
return latex_code
result = latex_code
# Fix TikZ nodes with \\\\ but missing align=center
# Pattern: \node[...] (name) {Text\\More};
# This is a common AI error - TikZ requires align=center for \\\\ in nodes
# We need to find \node commands and add align=center if they have \\\\ in content
# but don't already have align= in their options
def fix_tikz_node(match):
"""Fix a single TikZ node by adding align=center if needed"""
full_match = match.group(0)
options = match.group(1) # Content inside [...]
rest = match.group(2) # Everything after options
# Check if this node has \\\\ in its content (text between { })
# and doesn't already have align=
if "\\\\" in rest and "align=" not in options:
# Add align=center to the options
if options.strip():
new_options = options.rstrip() + ", align=center"
else:
new_options = "align=center"
return f"\\node[{new_options}]{rest}"
return full_match
# Match \node[options] followed by rest of the line
# Capture options and the rest separately
tikz_node_pattern = r"\\node\[([^\]]*)\]([^;]*;)"
result = re.sub(tikz_node_pattern, fix_tikz_node, result)
return result
class DocumentGenerator:
"""
Generates academic summary documents in LaTeX format.
The system follows these principles:
1. latex/resumen.md is the SINGLE SOURCE OF TRUTH for prompt structure
2. Generates full LaTeX documents (not Markdown)
3. Compiles to PDF using pdflatex
4. Supports iterative fixing with AI if compilation fails
5. Supports progress notifications via callback
"""
def __init__(self, notification_callback: Optional[Callable[[str], None]] = None):
    """
    Initialize DocumentGenerator.

    Args:
        notification_callback: Optional callable invoked with progress
            messages (a single string argument), e.g. a Telegram sender.
    """
    self.logger = logging.getLogger(__name__)
    self.notification_callback = notification_callback
    self.ai_provider = ai_provider_factory.get_best_provider()
    self.use_parallel = ai_provider_factory.use_parallel()
    # Shared pool used for format conversions (e.g. DOCX) alongside PDF work.
    self.executor = ThreadPoolExecutor(max_workers=4)

    # Make sure both output locations exist before any generation runs.
    for directory in (settings.LOCAL_DOWNLOADS_PATH, settings.LOCAL_DOCX):
        directory.mkdir(parents=True, exist_ok=True)

    if self.use_parallel:
        self.logger.info(
            "🚀 Parallel processing enabled: Multiple AI providers available"
        )
def _notify(self, message: str) -> None:
"""Send notification if callback is configured"""
if self.notification_callback:
try:
self.notification_callback(message)
except Exception as e:
self.logger.warning(f"Failed to send notification: {e}")
def _generate_with_parallel_provider(self, prompt: str, **kwargs) -> str:
    """
    Race multiple AI providers and return the first successful response.

    Any failure in the parallel path (provider setup or generation) falls
    back to a single call on the default provider, so this never raises
    earlier than ``generate_text`` itself would.
    """
    try:
        provider = ai_provider_factory.get_parallel_provider(max_workers=4)
        self.logger.info("🚀 Using parallel AI provider (race mode)")
        outcome = provider.generate_parallel(
            prompt=prompt,
            strategy="race",  # Use first successful response
            timeout_ms=300000,  # 5 minutes
            **kwargs,
        )
        self.logger.info(
            f"✅ Parallel generation complete: {outcome.selected_provider} selected, "
            f"{outcome.total_duration_ms}ms"
        )
        return outcome.content
    except Exception as exc:
        self.logger.warning(
            f"⚠️ Parallel generation failed: {exc}, falling back to single provider"
        )
        return self.ai_provider.generate_text(prompt, **kwargs)
def _convert_formats_parallel(
    self, tex_path: Path, pdf_path: Optional[Path], base_name: str
) -> Optional[Path]:
    """
    Schedule DOCX conversion on the shared executor and wait for it.

    The PDF is expected to already exist by the time this runs, so only the
    DOCX conversion is submitted — and only when pandoc is installed.
    Returns the DOCX path, or None on failure / missing pandoc.
    """
    docx_future = None
    if shutil.which("pandoc"):
        docx_future = self.executor.submit(
            self._convert_tex_to_docx, tex_path, base_name
        )

    if docx_future is None:
        return None

    try:
        docx_path = docx_future.result(timeout=60)
    except Exception as e:
        self.logger.warning(f"⚠️ DOCX generation failed: {e}")
        return None

    if docx_path:
        self.logger.info(f"✅ Parallel DOCX generated: {docx_path}")
    return docx_path
def _upload_to_notion_background(
    self,
    base_name: str,
    summary: str,
    pdf_path: Optional[Path],
    metadata: Dict[str, Any],
):
    """
    Fire-and-forget Notion upload on a daemon thread (non-blocking).

    NOTE(review): the worker mutates *metadata* after this method returns,
    so callers reading notion_uploaded/notion_page_id may race the upload.
    """

    def upload_worker():
        try:
            # Imported lazily so the module loads without Notion configured.
            from services.notion_service import notion_service

            title = base_name.replace("_", " ").title()
            notion_metadata = {
                "file_type": "Audio",
                "pdf_path": pdf_path or Path(""),
                "add_status": False,
                "use_as_page": False,
            }
            page_id = notion_service.create_page_with_summary(
                title=title, summary=summary, metadata=notion_metadata
            )
            if not page_id:
                self.logger.warning(f"⚠️ Background Notion upload failed: {title}")
                return
            metadata["notion_uploaded"] = True
            metadata["notion_page_id"] = page_id
            self.logger.info(
                f"✅ Background upload to Notion complete: {title}"
            )
        except Exception as e:
            self.logger.warning(f"❌ Background Notion upload error: {e}")

    threading.Thread(target=upload_worker, daemon=True).start()
    self.logger.info("🔄 Notion upload started in background")
def generate_summary(
    self,
    text: str,
    base_name: str,
    materia: str = "Economía",
    bibliographic_text: Optional[str] = None,
    class_number: Optional[int] = None,
) -> Tuple[bool, str, Dict[str, Any]]:
    """
    Generate a comprehensive academic summary in LaTeX format.

    Pipeline: build the prompt from the resumen.md template -> generate
    LaTeX with the AI provider (racing providers when parallel mode is on)
    -> sanitize -> compile with pdflatex, asking the AI to fix compile
    errors up to ``max_retries`` times -> convert to DOCX and start a
    background Notion upload. Falls back to Markdown output when no valid
    LaTeX is detected or all compilation attempts fail.

    Args:
        text: The class transcription text
        base_name: Base filename for output files
        materia: Subject name (default: "Economía")
        bibliographic_text: Optional supporting material from books/notes
        class_number: Optional class number for header

    Returns:
        Tuple of (success, summary_text, metadata). ``metadata`` carries the
        generated file paths, a 500-char snippet, and the Notion upload
        state — note the background upload thread may update the Notion
        fields after this method has already returned.
    """
    self.logger.info(
        f"🚀 Starting LaTeX academic summary generation for: {base_name}"
    )
    metadata = {
        "filename": base_name,
        "tex_path": "",
        "pdf_path": "",
        "markdown_path": "",
        "docx_path": "",
        "summary_snippet": "",
        "notion_uploaded": False,
        "notion_page_id": None,
        "materia": materia,
    }
    try:
        # === STEP 1: Generate LaTeX content using AI ===
        self.logger.info(
            "🧠 Sending request to AI Provider for LaTeX generation..."
        )
        self._notify("📝 Preparando prompt de resumen académico...")
        prompt = prompt_manager.get_latex_summary_prompt(
            transcription=text,
            materia=materia,
            bibliographic_text=bibliographic_text,
            class_number=class_number,
        )
        self._notify(
            "🧠 Enviando solicitud a la IA (esto puede tardar unos minutos)..."
        )
        # Use parallel provider if multiple AI providers are available
        if self.use_parallel:
            raw_response = self._generate_with_parallel_provider(prompt)
        else:
            raw_response = self.ai_provider.generate_text(prompt)
        if not raw_response:
            raise FileProcessingError("AI returned empty response")
        self.logger.info(f"📝 AI response received: {len(raw_response)} characters")
        self._notify(f"✅ Respuesta recibida ({len(raw_response)} caracteres)")
        # === STEP 2: Extract clean LaTeX from AI response ===
        self._notify("🔍 Extrayendo código LaTeX...")
        latex_content = prompt_manager.extract_latex_from_response(raw_response)
        if not latex_content:
            self.logger.warning(
                "⚠️ No valid LaTeX found in response, treating as Markdown"
            )
            self._notify("⚠️ No se detectó LaTeX válido, usando modo compatible...")
            # Fallback to Markdown processing
            return self._fallback_to_markdown(raw_response, base_name, metadata)
        self.logger.info("✨ Valid LaTeX content detected")
        self._notify(f"✨ LaTeX detectado: {len(latex_content)} caracteres")
        # === STEP 3: Compilation Loop with Self-Correction ===
        # Up to max_retries + 1 compile attempts; each failure feeds the
        # pdflatex log back to the AI via fix_latex() for correction.
        max_retries = 3
        current_latex = latex_content
        for attempt in range(max_retries + 1):
            # Sanitize LaTeX before saving (fix common AI errors like TikZ nodes)
            current_latex = _sanitize_latex(current_latex)
            # Save current .tex file
            self._notify(
                f"📄 Guardando archivo .tex (intento {attempt + 1}/{max_retries + 1})..."
            )
            tex_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.tex"
            tex_path.write_text(current_latex, encoding="utf-8")
            metadata["tex_path"] = str(tex_path)
            # Try to compile
            self._notify("⚙️ Primera pasada de compilación LaTeX...")
            pdf_path = self._compile_latex(
                tex_path, output_dir=settings.LOCAL_DOWNLOADS_PATH
            )
            if pdf_path:
                self.logger.info(
                    f"✅ Compilation success on attempt {attempt + 1}!"
                )
                self._notify("✅ PDF generado exitosamente!")
                metadata["pdf_path"] = str(pdf_path)
                # Generate DOCX in parallel
                self._notify("📄 Generando archivo DOCX en paralelo...")
                docx_path = self._convert_formats_parallel(
                    tex_path, pdf_path, base_name
                )
                if docx_path:
                    self._notify("✅ DOCX generado exitosamente!")
                    metadata["docx_path"] = str(docx_path)
                # Create a text summary for Notion/preview
                text_summary = self._create_text_summary(current_latex)
                metadata["summary_snippet"] = text_summary[:500] + "..."
                # Upload to Notion in background if configured
                if settings.has_notion_config:
                    self._notify("📤 Iniciando carga a Notion en segundo plano...")
                    self._upload_to_notion_background(
                        base_name=base_name,
                        summary=text_summary,
                        pdf_path=pdf_path,
                        metadata=metadata,
                    )
                self._notify("🎉 ¡Resumen completado con éxito!")
                return True, text_summary, metadata
            # Compilation failed - ask AI to fix
            if attempt < max_retries:
                self.logger.warning(
                    f"⚠️ Compilation failed (Attempt {attempt + 1}/{max_retries + 1}). "
                    f"Requesting AI fix..."
                )
                self._notify(
                    f"⚠️ Error de compilación ({attempt + 1}/{max_retries + 1}), solicitando corrección a IA..."
                )
                # Get error log (last 2000 chars only, to keep the fix prompt small)
                log_file = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.log"
                error_log = "Log file not found"
                if log_file.exists():
                    error_log = log_file.read_text(
                        encoding="utf-8", errors="ignore"
                    )[-2000:]
                # Ask AI to fix
                try:
                    self._notify("🔧 La IA está corrigiendo el código LaTeX...")
                    if hasattr(self.ai_provider, "fix_latex"):
                        fixed_latex = self.ai_provider.fix_latex(
                            current_latex, error_log
                        )
                        # Re-extract in case the fix came wrapped in prose/fences.
                        cleaned = prompt_manager.extract_latex_from_response(
                            fixed_latex
                        )
                        if cleaned:
                            current_latex = cleaned
                        else:
                            current_latex = fixed_latex
                        self._notify(
                            "✅ Código LaTeX corregido, reintentando compilación..."
                        )
                    else:
                        self.logger.error(
                            "❌ AI provider doesn't support fix_latex()"
                        )
                        break
                except Exception as e:
                    self.logger.error(f"❌ AI fix request failed: {e}")
                    break
            else:
                self.logger.error(
                    "❌ Max retries reached. LaTeX compilation failed."
                )
                self._notify(
                    "❌ No se pudo compilar el LaTeX después de varios intentos"
                )
        # If we get here, all compilation attempts failed
        self._notify("⚠️ Usando modo de compatibilidad Markdown...")
        return self._fallback_to_markdown(
            current_latex or raw_response, base_name, metadata
        )
    except Exception as e:
        self.logger.error(
            f"❌ Critical error in document generation: {e}", exc_info=True
        )
        self._notify(f"❌ Error en la generación: {str(e)[:100]}")
        return False, "", metadata
def _compile_latex(self, tex_path: Path, output_dir: Path) -> Optional[Path]:
"""
Compile LaTeX to PDF using pdflatex. Runs twice for TOC.
Args:
tex_path: Path to .tex file
output_dir: Directory for output files
Returns:
Path to generated PDF or None if failed
"""
base_name = tex_path.stem
expected_pdf = output_dir / f"{base_name}.pdf"
# Check if pdflatex is available
if not shutil.which("pdflatex"):
self.logger.error("🚫 pdflatex not found in system PATH")
return None
cmd = [
"pdflatex",
"-interaction=nonstopmode",
"-halt-on-error",
f"-output-directory={output_dir}",
str(tex_path),
]
try:
# Pass 1
self.logger.info("⚙️ Compiling LaTeX (Pass 1/2)...")
subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
timeout=120,
)
# Pass 2 (for TOC resolution)
self.logger.info("⚙️ Compiling LaTeX (Pass 2/2)...")
result = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
timeout=120,
)
if result.returncode == 0 and expected_pdf.exists():
self.logger.info(f"✅ PDF generated: {expected_pdf}")
self._cleanup_latex_aux(output_dir, base_name)
return expected_pdf
else:
# Read log file for error info
log_file = output_dir / f"{base_name}.log"
error_snippet = "Unknown error"
if log_file.exists():
try:
log_content = log_file.read_text(
encoding="utf-8", errors="ignore"
)
errors = [
line
for line in log_content.splitlines()
if line.startswith("!")
]
if errors:
error_snippet = errors[0][:200]
except:
pass
self.logger.error(f"❌ LaTeX compilation failed: {error_snippet}")
return None
except subprocess.TimeoutExpired:
self.logger.error("❌ LaTeX compilation timed out")
return None
except Exception as e:
self.logger.error(f"❌ Error during LaTeX execution: {e}")
return None
def _convert_tex_to_docx(self, tex_path: Path, base_name: str) -> Optional[Path]:
    """Convert .tex to .docx using Pandoc."""
    if shutil.which("pandoc") is None:
        self.logger.warning("⚠️ pandoc not found, skipping DOCX generation")
        return None

    target = settings.LOCAL_DOCX / f"{base_name}.docx"
    pandoc_cmd = ["pandoc", str(tex_path), "-o", str(target)]
    try:
        subprocess.run(
            pandoc_cmd, capture_output=True, text=True, check=True, timeout=60
        )
    except Exception as e:
        # Non-fatal: the PDF is the primary artifact, DOCX is best-effort.
        self.logger.warning(f"⚠️ DOCX generation failed: {e}")
        return None
    self.logger.info(f"✅ DOCX generated: {target}")
    return target
def _create_text_summary(self, latex_content: str) -> str:
"""Extract a plain text summary from LaTeX content for Notion/preview."""
# Remove LaTeX commands and keep content
text = latex_content
# Remove document class and packages
text = re.sub(r"\\documentclass\[?[^\]]*\]?\{[^\}]+\}", "", text)
text = re.sub(r"\\usepackage\{[^\}]+\}", "", text)
text = re.sub(r"\\geometry\{[^\}]+\}", "", text)
text = re.sub(r"\\pagestyle\{[^\}]+\}", "", text)
text = re.sub(r"\\fancyhf\{\}", "", text)
text = re.sub(r"\\fancyhead\[?[^\]]*\]?\{[^\}]+\}", "", text)
text = re.sub(r"\\fancyfoot\[?[^\]]*\]?\{[^\}]+\}", "", text)
# Convert sections to markdown-style
text = re.sub(r"\\section\*?\{([^\}]+)\}", r"# \1", text)
text = re.sub(r"\\subsection\*?\{([^\}]+)\}", r"## \1", text)
text = re.sub(r"\\subsubsection\*?\{([^\}]+)\}", r"### \1", text)
# Remove tcolorbox environments (keep content)
text = re.sub(
r"\\begin\{(definicion|importante|ejemplo)\}\[?[^\]]*\]?",
r"\n**\1:** ",
text,
)
text = re.sub(r"\\end\{(definicion|importante|ejemplo)\}", "", text)
# Convert itemize to bullets
text = re.sub(r"\\item\s*", "- ", text)
text = re.sub(r"\\begin\{(itemize|enumerate)\}", "", text)
text = re.sub(r"\\end\{(itemize|enumerate)\}", "", text)
# Clean up math (basic)
text = re.sub(r"\$\$([^\$]+)\$\$", r"\n\n\1\n\n", text)
text = re.sub(r"\$([^\$]+)\$", r"\1", text)
# Remove remaining LaTeX commands
text = re.sub(r"\\[a-zA-Z]+(\{[^\}]*\})*", "", text)
text = re.sub(r"[{}]", "", text)
# Clean whitespace
text = re.sub(r"\n\s*\n\s*\n", "\n\n", text)
text = text.strip()
return text
def _fallback_to_markdown(
    self, content: str, base_name: str, metadata: Dict[str, Any]
) -> Tuple[bool, str, Dict[str, Any]]:
    """Persist raw content as Markdown and best-effort convert it to PDF/DOCX."""
    self.logger.warning("⚠️ Falling back to Markdown processing")
    md_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_resumen.md"
    md_path.write_text(content, encoding="utf-8")
    metadata["markdown_path"] = str(md_path)

    # pandoc is optional; without it we still return the raw Markdown.
    if shutil.which("pandoc"):
        pdf_result = self._convert_md_to_pdf(md_path, base_name)
        if pdf_result:
            metadata["pdf_path"] = str(pdf_result)
        docx_result = self._convert_md_to_docx(md_path, base_name)
        if docx_result:
            metadata["docx_path"] = str(docx_result)

    metadata["summary_snippet"] = content[:500] + "..."
    return True, content, metadata
def _convert_md_to_pdf(self, md_path: Path, base_name: str) -> Optional[Path]:
    """Convert Markdown to PDF using pandoc (pdflatex engine, 2.5cm margins)."""
    pdf_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.pdf"
    pandoc_cmd = [
        "pandoc",
        str(md_path),
        "-o",
        str(pdf_path),
        "--pdf-engine=pdflatex",
        "-V",
        "geometry:margin=2.5cm",
    ]
    try:
        subprocess.run(
            pandoc_cmd, capture_output=True, text=True, check=True, timeout=60
        )
    except Exception as e:
        self.logger.warning(f"⚠️ PDF from Markdown failed: {e}")
        return None
    self.logger.info(f"✅ PDF from Markdown: {pdf_path}")
    return pdf_path
def _convert_md_to_docx(self, md_path: Path, base_name: str) -> Optional[Path]:
    """Convert Markdown to DOCX using pandoc."""
    docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
    pandoc_cmd = ["pandoc", str(md_path), "-o", str(docx_path)]
    try:
        subprocess.run(
            pandoc_cmd, capture_output=True, text=True, check=True, timeout=60
        )
    except Exception as e:
        self.logger.warning(f"⚠️ DOCX from Markdown failed: {e}")
        return None
    self.logger.info(f"✅ DOCX from Markdown: {docx_path}")
    return docx_path
def _cleanup_latex_aux(self, output_dir: Path, base_name: str):
"""Clean up auxiliary LaTeX files."""
extensions = [".aux", ".log", ".out", ".toc"]
for ext in extensions:
aux_file = output_dir / f"{base_name}{ext}"
if aux_file.exists():
try:
aux_file.unlink()
except:
pass
def _upload_to_notion(
    self,
    base_name: str,
    summary: str,
    pdf_path: Optional[Path],
    metadata: Dict[str, Any],
):
    """Synchronously upload the summary to Notion, recording the page id in *metadata*."""
    try:
        # Imported lazily so the module loads without Notion configured.
        from services.notion_service import notion_service

        title = base_name.replace("_", " ").title()
        notion_metadata = {
            "file_type": "Audio",
            "pdf_path": pdf_path or Path(""),
            "add_status": False,
            "use_as_page": False,
        }
        page_id = notion_service.create_page_with_summary(
            title=title, summary=summary, metadata=notion_metadata
        )
        if not page_id:
            self.logger.warning(f"⚠️ Notion upload failed: {title}")
            return
        metadata["notion_uploaded"] = True
        metadata["notion_page_id"] = page_id
        self.logger.info(f"✅ Uploaded to Notion: {title}")
    except Exception as e:
        # Notion failures must not abort the document pipeline.
        self.logger.warning(f"❌ Notion upload error: {e}")