feat: Sistema LaTeX mejorado con sanitización automática y corrección de TikZ
Cambios principales: ## Nuevos archivos - services/ai/parallel_provider.py: Ejecución paralela de múltiples proveedores AI - services/ai/prompt_manager.py: Gestión centralizada de prompts (resumen.md como fuente) - latex/resumen.md: Template del prompt para resúmenes académicos LaTeX ## Mejoras en generación LaTeX (document/generators.py) - Nueva función _sanitize_latex(): Corrige automáticamente errores comunes de AI - Agrega align=center a nodos TikZ con saltos de línea (\\) - Previene errores 'Not allowed in LR mode' antes de compilar - Soporte para procesamiento paralelo de proveedores AI - Conversión DOCX en paralelo con generación PDF - Uploads a Notion en background (non-blocking) - Callbacks de notificación para progreso en Telegram ## Mejoras en proveedores AI - claude_provider.py: fix_latex() con instrucciones específicas para errores TikZ - gemini_provider.py: fix_latex() mejorado + rate limiting + circuit breaker - provider_factory.py: Soporte para parallel provider ## Otros cambios - config/settings.py: Nuevas configuraciones para Gemini models - services/webdav_service.py: Mejoras en manejo de conexión - .gitignore: Ignora archivos LaTeX auxiliares (.aux, .toc, .out, .pdf) ## Archivos de ejemplo - latex/imperio_romano.tex, latex/clase_revolucion_rusa_crisis_30.tex - resumen_curiosidades.tex (corregido y compilado exitosamente)
This commit is contained in:
@@ -1,352 +1,669 @@
|
||||
"""
|
||||
Document generation utilities
|
||||
Document generation utilities - LaTeX Academic Summary System
|
||||
|
||||
This module generates comprehensive academic summaries in LaTeX format
|
||||
following the specifications in latex/resumen.md (the SINGLE SOURCE OF TRUTH).
|
||||
|
||||
Parallel Processing: Uses multiple agents for accelerated summary generation:
|
||||
- AI Provider Racing: Multiple AI providers generate in parallel
|
||||
- Parallel Format Conversion: PDF + DOCX generated simultaneously
|
||||
- Background Notion Uploads: Non-blocking uploads to Notion
|
||||
"""
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
import shutil
|
||||
import re
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List, Tuple
|
||||
from typing import Dict, Any, Optional, Tuple, Callable
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from core import FileProcessingError
|
||||
from config import settings
|
||||
from services.ai import ai_provider_factory
|
||||
from services.ai.prompt_manager import prompt_manager
|
||||
|
||||
|
||||
def _sanitize_latex(latex_code: str) -> str:
|
||||
"""
|
||||
Pre-process LaTeX code to fix common errors before compilation.
|
||||
|
||||
This function applies automated fixes for known issues that AI models
|
||||
frequently generate, reducing the need for fix_latex() iterations.
|
||||
|
||||
Currently handles:
|
||||
- TikZ nodes with line breaks (\\\\) missing align=center
|
||||
- Unbalanced environments (best effort)
|
||||
"""
|
||||
if not latex_code:
|
||||
return latex_code
|
||||
|
||||
result = latex_code
|
||||
|
||||
# Fix TikZ nodes with \\\\ but missing align=center
|
||||
# Pattern: \node[...] (name) {Text\\More};
|
||||
# This is a common AI error - TikZ requires align=center for \\\\ in nodes
|
||||
|
||||
# We need to find \node commands and add align=center if they have \\\\ in content
|
||||
# but don't already have align= in their options
|
||||
|
||||
def fix_tikz_node(match):
|
||||
"""Fix a single TikZ node by adding align=center if needed"""
|
||||
full_match = match.group(0)
|
||||
options = match.group(1) # Content inside [...]
|
||||
rest = match.group(2) # Everything after options
|
||||
|
||||
# Check if this node has \\\\ in its content (text between { })
|
||||
# and doesn't already have align=
|
||||
if "\\\\" in rest and "align=" not in options:
|
||||
# Add align=center to the options
|
||||
if options.strip():
|
||||
new_options = options.rstrip() + ", align=center"
|
||||
else:
|
||||
new_options = "align=center"
|
||||
return f"\\node[{new_options}]{rest}"
|
||||
|
||||
return full_match
|
||||
|
||||
# Match \node[options] followed by rest of the line
|
||||
# Capture options and the rest separately
|
||||
tikz_node_pattern = r"\\node\[([^\]]*)\]([^;]*;)"
|
||||
result = re.sub(tikz_node_pattern, fix_tikz_node, result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class DocumentGenerator:
|
||||
"""Generate documents from processed text"""
|
||||
"""
|
||||
Generates academic summary documents in LaTeX format.
|
||||
|
||||
def __init__(self):
|
||||
The system follows these principles:
|
||||
1. latex/resumen.md is the SINGLE SOURCE OF TRUTH for prompt structure
|
||||
2. Generates full LaTeX documents (not Markdown)
|
||||
3. Compiles to PDF using pdflatex
|
||||
4. Supports iterative fixing with AI if compilation fails
|
||||
5. Supports progress notifications via callback
|
||||
"""
|
||||
|
||||
def __init__(self, notification_callback: Optional[Callable[[str], None]] = None):
|
||||
"""
|
||||
Initialize DocumentGenerator.
|
||||
|
||||
Args:
|
||||
notification_callback: Optional callback function for progress notifications
|
||||
Takes a single string argument (message to send)
|
||||
"""
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.ai_provider = ai_provider_factory.get_best_provider()
|
||||
self.notification_callback = notification_callback
|
||||
self.use_parallel = ai_provider_factory.use_parallel()
|
||||
self.executor = ThreadPoolExecutor(max_workers=4)
|
||||
|
||||
# Ensure output directories exist
|
||||
settings.LOCAL_DOWNLOADS_PATH.mkdir(parents=True, exist_ok=True)
|
||||
settings.LOCAL_DOCX.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if self.use_parallel:
|
||||
self.logger.info(
|
||||
"🚀 Parallel processing enabled: Multiple AI providers available"
|
||||
)
|
||||
|
||||
def _notify(self, message: str) -> None:
|
||||
"""Send notification if callback is configured"""
|
||||
if self.notification_callback:
|
||||
try:
|
||||
self.notification_callback(message)
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to send notification: {e}")
|
||||
|
||||
def _generate_with_parallel_provider(self, prompt: str, **kwargs) -> str:
|
||||
"""
|
||||
Generate content using multiple AI providers in parallel.
|
||||
|
||||
Races multiple providers and returns the first successful response,
|
||||
or the best quality response if using consensus strategy.
|
||||
"""
|
||||
try:
|
||||
parallel_provider = ai_provider_factory.get_parallel_provider(max_workers=4)
|
||||
self.logger.info("🚀 Using parallel AI provider (race mode)")
|
||||
|
||||
result = parallel_provider.generate_parallel(
|
||||
prompt=prompt,
|
||||
strategy="race", # Use first successful response
|
||||
timeout_ms=300000, # 5 minutes
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.logger.info(
|
||||
f"✅ Parallel generation complete: {result.selected_provider} selected, "
|
||||
f"{result.total_duration_ms}ms"
|
||||
)
|
||||
|
||||
return result.content
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(
|
||||
f"⚠️ Parallel generation failed: {e}, falling back to single provider"
|
||||
)
|
||||
return self.ai_provider.generate_text(prompt, **kwargs)
|
||||
|
||||
def _convert_formats_parallel(
|
||||
self, tex_path: Path, pdf_path: Optional[Path], base_name: str
|
||||
) -> Optional[Path]:
|
||||
"""
|
||||
Convert to multiple formats in parallel (DOCX, optionally PDF).
|
||||
|
||||
If PDF is already compiled, only DOCX is generated.
|
||||
Otherwise, both PDF and DOCX are generated in parallel.
|
||||
"""
|
||||
futures = {}
|
||||
|
||||
# Generate DOCX
|
||||
if shutil.which("pandoc"):
|
||||
futures["docx"] = self.executor.submit(
|
||||
self._convert_tex_to_docx, tex_path, base_name
|
||||
)
|
||||
|
||||
# Wait for DOCX completion
|
||||
docx_path = None
|
||||
if "docx" in futures:
|
||||
try:
|
||||
docx_path = futures["docx"].result(timeout=60)
|
||||
if docx_path:
|
||||
self.logger.info(f"✅ Parallel DOCX generated: {docx_path}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"⚠️ DOCX generation failed: {e}")
|
||||
|
||||
return docx_path
|
||||
|
||||
def _upload_to_notion_background(
|
||||
self,
|
||||
base_name: str,
|
||||
summary: str,
|
||||
pdf_path: Optional[Path],
|
||||
metadata: Dict[str, Any],
|
||||
):
|
||||
"""Upload to Notion in background thread (non-blocking)."""
|
||||
|
||||
def upload_worker():
|
||||
try:
|
||||
from services.notion_service import notion_service
|
||||
|
||||
title = base_name.replace("_", " ").title()
|
||||
notion_metadata = {
|
||||
"file_type": "Audio",
|
||||
"pdf_path": pdf_path or Path(""),
|
||||
"add_status": False,
|
||||
"use_as_page": False,
|
||||
}
|
||||
|
||||
page_id = notion_service.create_page_with_summary(
|
||||
title=title, summary=summary, metadata=notion_metadata
|
||||
)
|
||||
|
||||
if page_id:
|
||||
metadata["notion_uploaded"] = True
|
||||
metadata["notion_page_id"] = page_id
|
||||
self.logger.info(
|
||||
f"✅ Background upload to Notion complete: {title}"
|
||||
)
|
||||
else:
|
||||
self.logger.warning(f"⚠️ Background Notion upload failed: {title}")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"❌ Background Notion upload error: {e}")
|
||||
|
||||
# Start background thread
|
||||
thread = threading.Thread(target=upload_worker, daemon=True)
|
||||
thread.start()
|
||||
self.logger.info("🔄 Notion upload started in background")
|
||||
|
||||
def generate_summary(
|
||||
self, text: str, base_name: str
|
||||
self,
|
||||
text: str,
|
||||
base_name: str,
|
||||
materia: str = "Economía",
|
||||
bibliographic_text: Optional[str] = None,
|
||||
class_number: Optional[int] = None,
|
||||
) -> Tuple[bool, str, Dict[str, Any]]:
|
||||
"""Generate unified summary"""
|
||||
self.logger.info(f"Generating summary for {base_name}")
|
||||
"""
|
||||
Generate comprehensive academic summary in LaTeX format.
|
||||
|
||||
Args:
|
||||
text: The class transcription text
|
||||
base_name: Base filename for output files
|
||||
materia: Subject name (default: "Economía")
|
||||
bibliographic_text: Optional supporting material from books/notes
|
||||
class_number: Optional class number for header
|
||||
|
||||
Returns:
|
||||
Tuple of (success, summary_text, metadata)
|
||||
"""
|
||||
self.logger.info(
|
||||
f"🚀 Starting LaTeX academic summary generation for: {base_name}"
|
||||
)
|
||||
|
||||
metadata = {
|
||||
"filename": base_name,
|
||||
"tex_path": "",
|
||||
"pdf_path": "",
|
||||
"markdown_path": "",
|
||||
"docx_path": "",
|
||||
"summary_snippet": "",
|
||||
"notion_uploaded": False,
|
||||
"notion_page_id": None,
|
||||
"materia": materia,
|
||||
}
|
||||
|
||||
try:
|
||||
# Step 1: Generate Bullet Points (Chunking handled by provider or single prompt for now)
|
||||
# Note: We use the main provider (Claude/Zai) for content generation
|
||||
self.logger.info("Generating bullet points...")
|
||||
bullet_prompt = f"""Analiza el siguiente texto y extrae entre 5 y 8 bullet points clave en español.
|
||||
# === STEP 1: Generate LaTeX content using AI ===
|
||||
self.logger.info(
|
||||
"🧠 Sending request to AI Provider for LaTeX generation..."
|
||||
)
|
||||
self._notify("📝 Preparando prompt de resumen académico...")
|
||||
|
||||
REGLAS ESTRICTAS:
|
||||
1. Devuelve ÚNICAMENTE bullet points, cada línea iniciando con "- "
|
||||
2. Cada bullet debe ser conciso (12-20 palabras) y resaltar datos, fechas, conceptos o conclusiones importantes
|
||||
3. NO agregues introducciones, conclusiones ni texto explicativo
|
||||
4. Concéntrate en los puntos más importantes del texto
|
||||
5. Incluye fechas, datos específicos y nombres relevantes si los hay
|
||||
prompt = prompt_manager.get_latex_summary_prompt(
|
||||
transcription=text,
|
||||
materia=materia,
|
||||
bibliographic_text=bibliographic_text,
|
||||
class_number=class_number,
|
||||
)
|
||||
|
||||
Texto:
|
||||
{text[:15000]}""" # Truncate to avoid context limits if necessary, though providers handle it differently
|
||||
self._notify(
|
||||
"🧠 Enviando solicitud a la IA (esto puede tardar unos minutos)..."
|
||||
)
|
||||
|
||||
try:
|
||||
bullet_points = self.ai_provider.generate_text(bullet_prompt)
|
||||
self.logger.info(f"Bullet points generated: {len(bullet_points)}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Bullet point generation failed: {e}")
|
||||
bullet_points = "- Puntos clave no disponibles por error en IA"
|
||||
# Use parallel provider if multiple AI providers are available
|
||||
if self.use_parallel:
|
||||
raw_response = self._generate_with_parallel_provider(prompt)
|
||||
else:
|
||||
raw_response = self.ai_provider.generate_text(prompt)
|
||||
|
||||
# Step 2: Generate Unified Summary
|
||||
self.logger.info("Generating unified summary...")
|
||||
summary_prompt = f"""Eres un profesor universitario experto en historia y economía. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos.
|
||||
if not raw_response:
|
||||
raise FileProcessingError("AI returned empty response")
|
||||
|
||||
REQUISITOS ESTRICTOS DE CONTENIDO:
|
||||
- Extensión entre 500-700 palabras
|
||||
- Usa encabezados Markdown con jerarquía clara (##, ###)
|
||||
- Desarrolla los puntos clave con profundidad y contexto histórico/económico
|
||||
- Mantén un tono académico y analítico
|
||||
- Incluye conclusiones significativas
|
||||
- NO agregues texto fuera del resumen
|
||||
- Devuelve únicamente el resumen en formato Markdown
|
||||
self.logger.info(f"📝 AI response received: {len(raw_response)} characters")
|
||||
self._notify(f"✅ Respuesta recibida ({len(raw_response)} caracteres)")
|
||||
|
||||
REQUISITOS ESTRICTOS DE FORMATO MATEMÁTICO (LaTeX):
|
||||
- Si el texto incluye fórmulas matemáticas o económicas, DEBES usar formato LaTeX.
|
||||
- Usa bloques $$ ... $$ para ecuaciones centradas importantes.
|
||||
- Usa $ ... $ para ecuaciones en línea.
|
||||
- Ejemplo: La fórmula del interés compuesto es $A = P(1 + r/n)^{{nt}}$.
|
||||
- NO uses bloques de código (```latex) para las fórmulas, úsalas directamente en el texto para que Pandoc las renderice.
|
||||
# === STEP 2: Extract clean LaTeX from AI response ===
|
||||
self._notify("🔍 Extrayendo código LaTeX...")
|
||||
|
||||
Contenido a resumir:
|
||||
{text[:20000]}
|
||||
latex_content = prompt_manager.extract_latex_from_response(raw_response)
|
||||
|
||||
Puntos clave a incluir obligatoriamente:
|
||||
{bullet_points}"""
|
||||
if not latex_content:
|
||||
self.logger.warning(
|
||||
"⚠️ No valid LaTeX found in response, treating as Markdown"
|
||||
)
|
||||
self._notify("⚠️ No se detectó LaTeX válido, usando modo compatible...")
|
||||
# Fallback to Markdown processing
|
||||
return self._fallback_to_markdown(raw_response, base_name, metadata)
|
||||
|
||||
try:
|
||||
raw_summary = self.ai_provider.generate_text(summary_prompt)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Raw summary generation failed: {e}")
|
||||
raise e
|
||||
self.logger.info("✨ Valid LaTeX content detected")
|
||||
self._notify(f"✨ LaTeX detectado: {len(latex_content)} caracteres")
|
||||
|
||||
# Step 3: Format with IA (using main provider instead of Gemini)
|
||||
self.logger.info("Formatting summary with IA...")
|
||||
format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible y compatible con Pandoc:
|
||||
# === STEP 3: Compilation Loop with Self-Correction ===
|
||||
max_retries = 3
|
||||
current_latex = latex_content
|
||||
|
||||
{raw_summary}
|
||||
for attempt in range(max_retries + 1):
|
||||
# Sanitize LaTeX before saving (fix common AI errors like TikZ nodes)
|
||||
current_latex = _sanitize_latex(current_latex)
|
||||
|
||||
Instrucciones:
|
||||
- Corrige cualquier error de formato Markdown
|
||||
- Asegúrate de que los encabezados estén bien espaciados
|
||||
- Verifica que las viñetas usen "- " correctamente
|
||||
- Mantén exactamente el contenido existente
|
||||
- EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave
|
||||
- VERIFICA que todas las fórmulas matemáticas estén correctamente encerradas en $...$ (inline) o $$...$$ (display)
|
||||
- NO alteres la sintaxis LaTeX dentro de los delimitadores $...$ o $$...$$
|
||||
- Devuelve únicamente el resumen formateado sin texto adicional"""
|
||||
# Save current .tex file
|
||||
self._notify(
|
||||
f"📄 Guardando archivo .tex (intento {attempt + 1}/{max_retries + 1})..."
|
||||
)
|
||||
|
||||
try:
|
||||
# Use the main provider (Claude/GLM) for formatting too
|
||||
if self.ai_provider.is_available():
|
||||
summary = self.ai_provider.generate_text(format_prompt)
|
||||
else:
|
||||
self.logger.warning(
|
||||
"AI provider not available for formatting, using raw summary"
|
||||
tex_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.tex"
|
||||
tex_path.write_text(current_latex, encoding="utf-8")
|
||||
metadata["tex_path"] = str(tex_path)
|
||||
|
||||
# Try to compile
|
||||
self._notify("⚙️ Primera pasada de compilación LaTeX...")
|
||||
|
||||
pdf_path = self._compile_latex(
|
||||
tex_path, output_dir=settings.LOCAL_DOWNLOADS_PATH
|
||||
)
|
||||
|
||||
if pdf_path:
|
||||
self.logger.info(
|
||||
f"✅ Compilation success on attempt {attempt + 1}!"
|
||||
)
|
||||
summary = raw_summary
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Formatting failed ({e}), using raw summary")
|
||||
summary = raw_summary
|
||||
self._notify("✅ PDF generado exitosamente!")
|
||||
metadata["pdf_path"] = str(pdf_path)
|
||||
|
||||
# Generate filename
|
||||
filename = self._generate_filename(text, summary)
|
||||
# Generate DOCX in parallel
|
||||
self._notify("📄 Generando archivo DOCX en paralelo...")
|
||||
docx_path = self._convert_formats_parallel(
|
||||
tex_path, pdf_path, base_name
|
||||
)
|
||||
if docx_path:
|
||||
self._notify("✅ DOCX generado exitosamente!")
|
||||
metadata["docx_path"] = str(docx_path)
|
||||
|
||||
# Create document
|
||||
markdown_path = self._create_markdown(summary, base_name)
|
||||
# Create a text summary for Notion/preview
|
||||
text_summary = self._create_text_summary(current_latex)
|
||||
metadata["summary_snippet"] = text_summary[:500] + "..."
|
||||
|
||||
docx_path = None
|
||||
try:
|
||||
docx_path = self._create_docx(markdown_path, base_name)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to create DOCX (non-critical): {e}")
|
||||
# Upload to Notion in background if configured
|
||||
if settings.has_notion_config:
|
||||
self._notify("📤 Iniciando carga a Notion en segundo plano...")
|
||||
self._upload_to_notion_background(
|
||||
base_name=base_name,
|
||||
summary=text_summary,
|
||||
pdf_path=pdf_path,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
pdf_path = None
|
||||
try:
|
||||
# Sanitize LaTeX before PDF generation
|
||||
self._sanitize_latex(markdown_path)
|
||||
pdf_path = self._create_pdf(markdown_path, base_name)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to create PDF (non-critical): {e}")
|
||||
self._notify("🎉 ¡Resumen completado con éxito!")
|
||||
return True, text_summary, metadata
|
||||
|
||||
# Upload to Notion if configured
|
||||
# Compilation failed - ask AI to fix
|
||||
if attempt < max_retries:
|
||||
self.logger.warning(
|
||||
f"⚠️ Compilation failed (Attempt {attempt + 1}/{max_retries + 1}). "
|
||||
f"Requesting AI fix..."
|
||||
)
|
||||
self._notify(
|
||||
f"⚠️ Error de compilación ({attempt + 1}/{max_retries + 1}), solicitando corrección a IA..."
|
||||
)
|
||||
|
||||
# Get error log
|
||||
log_file = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.log"
|
||||
error_log = "Log file not found"
|
||||
if log_file.exists():
|
||||
error_log = log_file.read_text(
|
||||
encoding="utf-8", errors="ignore"
|
||||
)[-2000:]
|
||||
|
||||
# Ask AI to fix
|
||||
try:
|
||||
self._notify("🔧 La IA está corrigiendo el código LaTeX...")
|
||||
if hasattr(self.ai_provider, "fix_latex"):
|
||||
fixed_latex = self.ai_provider.fix_latex(
|
||||
current_latex, error_log
|
||||
)
|
||||
cleaned = prompt_manager.extract_latex_from_response(
|
||||
fixed_latex
|
||||
)
|
||||
if cleaned:
|
||||
current_latex = cleaned
|
||||
else:
|
||||
current_latex = fixed_latex
|
||||
self._notify(
|
||||
"✅ Código LaTeX corregido, reintentando compilación..."
|
||||
)
|
||||
else:
|
||||
self.logger.error(
|
||||
"❌ AI provider doesn't support fix_latex()"
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
self.logger.error(f"❌ AI fix request failed: {e}")
|
||||
break
|
||||
else:
|
||||
self.logger.error(
|
||||
"❌ Max retries reached. LaTeX compilation failed."
|
||||
)
|
||||
self._notify(
|
||||
"❌ No se pudo compilar el LaTeX después de varios intentos"
|
||||
)
|
||||
|
||||
# If we get here, all compilation attempts failed
|
||||
self._notify("⚠️ Usando modo de compatibilidad Markdown...")
|
||||
return self._fallback_to_markdown(
|
||||
current_latex or raw_response, base_name, metadata
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
f"❌ Critical error in document generation: {e}", exc_info=True
|
||||
)
|
||||
self._notify(f"❌ Error en la generación: {str(e)[:100]}")
|
||||
return False, "", metadata
|
||||
|
||||
def _compile_latex(self, tex_path: Path, output_dir: Path) -> Optional[Path]:
|
||||
"""
|
||||
Compile LaTeX to PDF using pdflatex. Runs twice for TOC.
|
||||
|
||||
Args:
|
||||
tex_path: Path to .tex file
|
||||
output_dir: Directory for output files
|
||||
|
||||
Returns:
|
||||
Path to generated PDF or None if failed
|
||||
"""
|
||||
base_name = tex_path.stem
|
||||
expected_pdf = output_dir / f"{base_name}.pdf"
|
||||
|
||||
# Check if pdflatex is available
|
||||
if not shutil.which("pdflatex"):
|
||||
self.logger.error("🚫 pdflatex not found in system PATH")
|
||||
return None
|
||||
|
||||
cmd = [
|
||||
"pdflatex",
|
||||
"-interaction=nonstopmode",
|
||||
"-halt-on-error",
|
||||
f"-output-directory={output_dir}",
|
||||
str(tex_path),
|
||||
]
|
||||
|
||||
try:
|
||||
# Pass 1
|
||||
self.logger.info("⚙️ Compiling LaTeX (Pass 1/2)...")
|
||||
subprocess.run(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
check=False,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
# Pass 2 (for TOC resolution)
|
||||
self.logger.info("⚙️ Compiling LaTeX (Pass 2/2)...")
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
check=False,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
if result.returncode == 0 and expected_pdf.exists():
|
||||
self.logger.info(f"✅ PDF generated: {expected_pdf}")
|
||||
self._cleanup_latex_aux(output_dir, base_name)
|
||||
return expected_pdf
|
||||
else:
|
||||
# Read log file for error info
|
||||
log_file = output_dir / f"{base_name}.log"
|
||||
error_snippet = "Unknown error"
|
||||
if log_file.exists():
|
||||
try:
|
||||
log_content = log_file.read_text(
|
||||
encoding="utf-8", errors="ignore"
|
||||
)
|
||||
errors = [
|
||||
line
|
||||
for line in log_content.splitlines()
|
||||
if line.startswith("!")
|
||||
]
|
||||
if errors:
|
||||
error_snippet = errors[0][:200]
|
||||
except:
|
||||
pass
|
||||
|
||||
self.logger.error(f"❌ LaTeX compilation failed: {error_snippet}")
|
||||
return None
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
self.logger.error("❌ LaTeX compilation timed out")
|
||||
return None
|
||||
except Exception as e:
|
||||
self.logger.error(f"❌ Error during LaTeX execution: {e}")
|
||||
return None
|
||||
|
||||
def _convert_tex_to_docx(self, tex_path: Path, base_name: str) -> Optional[Path]:
|
||||
"""Convert .tex to .docx using Pandoc."""
|
||||
if not shutil.which("pandoc"):
|
||||
self.logger.warning("⚠️ pandoc not found, skipping DOCX generation")
|
||||
return None
|
||||
|
||||
docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
|
||||
cmd = ["pandoc", str(tex_path), "-o", str(docx_path)]
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
|
||||
self.logger.info(f"✅ DOCX generated: {docx_path}")
|
||||
return docx_path
|
||||
except Exception as e:
|
||||
self.logger.warning(f"⚠️ DOCX generation failed: {e}")
|
||||
return None
|
||||
|
||||
def _create_text_summary(self, latex_content: str) -> str:
|
||||
"""Extract a plain text summary from LaTeX content for Notion/preview."""
|
||||
# Remove LaTeX commands and keep content
|
||||
text = latex_content
|
||||
|
||||
# Remove document class and packages
|
||||
text = re.sub(r"\\documentclass\[?[^\]]*\]?\{[^\}]+\}", "", text)
|
||||
text = re.sub(r"\\usepackage\{[^\}]+\}", "", text)
|
||||
text = re.sub(r"\\geometry\{[^\}]+\}", "", text)
|
||||
text = re.sub(r"\\pagestyle\{[^\}]+\}", "", text)
|
||||
text = re.sub(r"\\fancyhf\{\}", "", text)
|
||||
text = re.sub(r"\\fancyhead\[?[^\]]*\]?\{[^\}]+\}", "", text)
|
||||
text = re.sub(r"\\fancyfoot\[?[^\]]*\]?\{[^\}]+\}", "", text)
|
||||
|
||||
# Convert sections to markdown-style
|
||||
text = re.sub(r"\\section\*?\{([^\}]+)\}", r"# \1", text)
|
||||
text = re.sub(r"\\subsection\*?\{([^\}]+)\}", r"## \1", text)
|
||||
text = re.sub(r"\\subsubsection\*?\{([^\}]+)\}", r"### \1", text)
|
||||
|
||||
# Remove tcolorbox environments (keep content)
|
||||
text = re.sub(
|
||||
r"\\begin\{(definicion|importante|ejemplo)\}\[?[^\]]*\]?",
|
||||
r"\n**\1:** ",
|
||||
text,
|
||||
)
|
||||
text = re.sub(r"\\end\{(definicion|importante|ejemplo)\}", "", text)
|
||||
|
||||
# Convert itemize to bullets
|
||||
text = re.sub(r"\\item\s*", "- ", text)
|
||||
text = re.sub(r"\\begin\{(itemize|enumerate)\}", "", text)
|
||||
text = re.sub(r"\\end\{(itemize|enumerate)\}", "", text)
|
||||
|
||||
# Clean up math (basic)
|
||||
text = re.sub(r"\$\$([^\$]+)\$\$", r"\n\n\1\n\n", text)
|
||||
text = re.sub(r"\$([^\$]+)\$", r"\1", text)
|
||||
|
||||
# Remove remaining LaTeX commands
|
||||
text = re.sub(r"\\[a-zA-Z]+(\{[^\}]*\})*", "", text)
|
||||
text = re.sub(r"[{}]", "", text)
|
||||
|
||||
# Clean whitespace
|
||||
text = re.sub(r"\n\s*\n\s*\n", "\n\n", text)
|
||||
text = text.strip()
|
||||
|
||||
return text
|
||||
|
||||
def _fallback_to_markdown(
|
||||
self, content: str, base_name: str, metadata: Dict[str, Any]
|
||||
) -> Tuple[bool, str, Dict[str, Any]]:
|
||||
"""Fallback when LaTeX generation fails."""
|
||||
self.logger.warning("⚠️ Falling back to Markdown processing")
|
||||
|
||||
md_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_resumen.md"
|
||||
md_path.write_text(content, encoding="utf-8")
|
||||
metadata["markdown_path"] = str(md_path)
|
||||
|
||||
# Try to convert to PDF via pandoc
|
||||
if shutil.which("pandoc"):
|
||||
pdf_path = self._convert_md_to_pdf(md_path, base_name)
|
||||
if pdf_path:
|
||||
metadata["pdf_path"] = str(pdf_path)
|
||||
|
||||
docx_path = self._convert_md_to_docx(md_path, base_name)
|
||||
if docx_path:
|
||||
metadata["docx_path"] = str(docx_path)
|
||||
|
||||
metadata["summary_snippet"] = content[:500] + "..."
|
||||
return True, content, metadata
|
||||
|
||||
def _convert_md_to_pdf(self, md_path: Path, base_name: str) -> Optional[Path]:
|
||||
"""Convert Markdown to PDF using pandoc."""
|
||||
pdf_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.pdf"
|
||||
cmd = [
|
||||
"pandoc",
|
||||
str(md_path),
|
||||
"-o",
|
||||
str(pdf_path),
|
||||
"--pdf-engine=pdflatex",
|
||||
"-V",
|
||||
"geometry:margin=2.5cm",
|
||||
]
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
|
||||
self.logger.info(f"✅ PDF from Markdown: {pdf_path}")
|
||||
return pdf_path
|
||||
except Exception as e:
|
||||
self.logger.warning(f"⚠️ PDF from Markdown failed: {e}")
|
||||
return None
|
||||
|
||||
def _convert_md_to_docx(self, md_path: Path, base_name: str) -> Optional[Path]:
|
||||
"""Convert Markdown to DOCX using pandoc."""
|
||||
docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
|
||||
cmd = ["pandoc", str(md_path), "-o", str(docx_path)]
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
|
||||
self.logger.info(f"✅ DOCX from Markdown: {docx_path}")
|
||||
return docx_path
|
||||
except Exception as e:
|
||||
self.logger.warning(f"⚠️ DOCX from Markdown failed: {e}")
|
||||
return None
|
||||
|
||||
def _cleanup_latex_aux(self, output_dir: Path, base_name: str):
|
||||
"""Clean up auxiliary LaTeX files."""
|
||||
extensions = [".aux", ".log", ".out", ".toc"]
|
||||
for ext in extensions:
|
||||
aux_file = output_dir / f"{base_name}{ext}"
|
||||
if aux_file.exists():
|
||||
try:
|
||||
aux_file.unlink()
|
||||
except:
|
||||
pass
|
||||
|
||||
def _upload_to_notion(
|
||||
self,
|
||||
base_name: str,
|
||||
summary: str,
|
||||
pdf_path: Optional[Path],
|
||||
metadata: Dict[str, Any],
|
||||
):
|
||||
"""Upload summary to Notion if configured."""
|
||||
try:
|
||||
from services.notion_service import notion_service
|
||||
|
||||
notion_uploaded = False
|
||||
notion_page_id = None
|
||||
if settings.has_notion_config:
|
||||
try:
|
||||
title = base_name.replace("_", " ").title()
|
||||
title = base_name.replace("_", " ").title()
|
||||
notion_metadata = {
|
||||
"file_type": "Audio",
|
||||
"pdf_path": pdf_path or Path(""),
|
||||
"add_status": False,
|
||||
"use_as_page": False,
|
||||
}
|
||||
|
||||
# Crear página con el contenido completo del resumen
|
||||
notion_metadata = {
|
||||
"file_type": "Audio", # O 'PDF' dependiendo del origen
|
||||
"pdf_path": pdf_path if pdf_path else Path(""),
|
||||
"add_status": False, # No usar Status/Tipo (no existen en la DB)
|
||||
"use_as_page": False, # Usar como database, no página
|
||||
}
|
||||
page_id = notion_service.create_page_with_summary(
|
||||
title=title, summary=summary, metadata=notion_metadata
|
||||
)
|
||||
|
||||
notion_page_id = notion_service.create_page_with_summary(
|
||||
title=title, summary=summary, metadata=notion_metadata
|
||||
)
|
||||
|
||||
if notion_page_id:
|
||||
notion_uploaded = True
|
||||
self.logger.info(
|
||||
f"✅ Resumen subido a Notion: {title} (ID: {notion_page_id})"
|
||||
)
|
||||
else:
|
||||
self.logger.warning(f"⚠️ No se pudo subir a Notion: {title}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"❌ Error al subir a Notion: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
if page_id:
|
||||
metadata["notion_uploaded"] = True
|
||||
metadata["notion_page_id"] = page_id
|
||||
self.logger.info(f"✅ Uploaded to Notion: {title}")
|
||||
else:
|
||||
self.logger.info("Notion not configured - skipping upload")
|
||||
|
||||
metadata = {
|
||||
"markdown_path": str(markdown_path),
|
||||
"docx_path": str(docx_path) if docx_path else "",
|
||||
"pdf_path": str(pdf_path) if pdf_path else "",
|
||||
"docx_name": Path(docx_path).name if docx_path else "",
|
||||
"summary": summary,
|
||||
"filename": filename,
|
||||
"notion_uploaded": notion_uploaded,
|
||||
"notion_page_id": notion_page_id,
|
||||
}
|
||||
|
||||
return True, summary, metadata
|
||||
self.logger.warning(f"⚠️ Notion upload failed: {title}")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Document generation process failed: {e}")
|
||||
return False, "", {}
|
||||
|
||||
def _sanitize_latex(self, markdown_path: Path) -> None:
|
||||
"""Sanitize LaTeX syntax in Markdown file to prevent Pandoc errors"""
|
||||
try:
|
||||
content = markdown_path.read_text(encoding="utf-8")
|
||||
|
||||
# 1. Unescape escaped dollar signs which are common LLM errors for math
|
||||
content = content.replace(r"\$", "$")
|
||||
|
||||
# 2. Fix common Cyrillic and Greek characters that sneak in via LLMs
|
||||
replacements = {
|
||||
"ч": "ch",
|
||||
"в": "v",
|
||||
"к": "k",
|
||||
"м": "m",
|
||||
"н": "n",
|
||||
"т": "t",
|
||||
"—": "-",
|
||||
"–": "-",
|
||||
"“": '"',
|
||||
"”": '"',
|
||||
"’": "'",
|
||||
"Δ": "$\\Delta$",
|
||||
"δ": "$\\delta$",
|
||||
"Σ": "$\\Sigma$",
|
||||
"σ": "$\\sigma$",
|
||||
"π": "$\\pi$",
|
||||
"Π": "$\\Pi$",
|
||||
"α": "$\\alpha$",
|
||||
"β": "$\\beta$",
|
||||
"γ": "$\\gamma$",
|
||||
"θ": "$\\theta$",
|
||||
"λ": "$\\lambda$",
|
||||
"μ": "$\\mu$",
|
||||
}
|
||||
|
||||
# Be careful not to double-replace already correct LaTeX
|
||||
for char, repl in replacements.items():
|
||||
if char in content:
|
||||
# Check if it's already inside math mode would be complex,
|
||||
# but for now we assume raw unicode greek chars should become latex
|
||||
content = content.replace(char, repl)
|
||||
|
||||
markdown_path.write_text(content, encoding="utf-8")
|
||||
self.logger.info(f"Sanitized LaTeX in {markdown_path}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to sanitize LaTeX: {e}")
|
||||
|
||||
def _generate_filename(self, text: str, summary: str) -> str:
|
||||
"""Generate intelligent filename"""
|
||||
try:
|
||||
# Use AI to extract key topics
|
||||
prompt = f"""Extract 2-3 key topics from this summary to create a filename.
|
||||
Summary: {summary}
|
||||
|
||||
Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
|
||||
|
||||
try:
|
||||
topics_text = self.ai_provider.generate_text(prompt)
|
||||
except Exception:
|
||||
topics_text = summary[:100]
|
||||
|
||||
# Simple topic extraction
|
||||
topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3]
|
||||
if not topics:
|
||||
topics = ["documento"]
|
||||
|
||||
# Limit topic length
|
||||
topics = [t[: settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]
|
||||
|
||||
filename = "_".join(topics)[: settings.MAX_FILENAME_LENGTH]
|
||||
return filename
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Filename generation failed: {e}")
|
||||
return "documento"
|
||||
|
||||
def _create_markdown(self, summary: str, base_name: str) -> Path:
    """Write the summary as a Markdown document.

    Args:
        summary: Generated summary text (assumed Markdown-compatible).
        base_name: Base file name; underscores become spaces in the
            document title.

    Returns:
        Path to the created ``{base_name}_unificado.md`` file under
        ``settings.LOCAL_DOWNLOADS_PATH``.
    """
    output_dir = settings.LOCAL_DOWNLOADS_PATH
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / f"{base_name}_unificado.md"

    content = f"""# {base_name.replace("_", " ").title()}

## Resumen

{summary}

---

*Generado por CBCFacil*
"""

    # Path.write_text replaces the manual open/write and guarantees the
    # handle is closed even on error.
    output_path.write_text(content, encoding="utf-8")

    return output_path
|
||||
|
||||
def _create_docx(self, markdown_path: Path, base_name: str) -> Path:
    """Convert a Markdown file to DOCX using pandoc.

    Args:
        markdown_path: Source Markdown file.
        base_name: Base file name for the output document.

    Returns:
        Path to ``{base_name}_unificado.docx`` under ``settings.LOCAL_DOCX``.

    Raises:
        FileProcessingError: if pandoc exits non-zero or cannot be run
            (e.g. the binary is not installed).
    """
    output_dir = settings.LOCAL_DOCX
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / f"{base_name}_unificado.docx"

    self.logger.info(
        f"Converting Markdown to DOCX: {markdown_path} -> {output_path}"
    )

    cmd = [
        "pandoc",
        str(markdown_path),
        "-o",
        str(output_path),
        "--from=markdown",
        "--to=docx",
    ]

    try:
        # check=True raises CalledProcessError on a non-zero pandoc exit;
        # keep the try body minimal so unrelated errors are not masked.
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        self.logger.error(f"Pandoc DOCX conversion failed: {e.stderr}")
        raise FileProcessingError(f"Failed to generate DOCX: {e.stderr}") from e
    except Exception as e:
        self.logger.error(f"Error generating DOCX: {e}")
        raise FileProcessingError(f"Error generating DOCX: {e}") from e

    self.logger.info("DOCX generated successfully with pandoc")
    return output_path
|
||||
|
||||
def _create_pdf(self, markdown_path: Path, base_name: str) -> Path:
    """Convert a Markdown file to PDF using pandoc with the pdflatex engine.

    Args:
        markdown_path: Source Markdown file.
        base_name: Base file name for the output document.

    Returns:
        Path to ``{base_name}_unificado.pdf`` under
        ``settings.LOCAL_DOWNLOADS_PATH``.

    Raises:
        FileProcessingError: if pandoc/pdflatex exits non-zero or cannot
            be run (e.g. a missing binary).
    """
    output_dir = settings.LOCAL_DOWNLOADS_PATH
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / f"{base_name}_unificado.pdf"

    self.logger.info(
        f"Converting Markdown to PDF: {markdown_path} -> {output_path}"
    )

    cmd = [
        "pandoc",
        str(markdown_path),
        "-o",
        str(output_path),
        "--pdf-engine=pdflatex",
        "-V",
        "geometry:margin=2.5cm",
        "-V",
        "fontsize=12pt",
        "--highlight-style=tango",
    ]

    try:
        # check=True raises CalledProcessError on a non-zero exit; keep
        # the try body minimal so unrelated errors are not masked.
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        self.logger.error(f"Pandoc PDF conversion failed: {e.stderr}")
        raise FileProcessingError(f"Failed to generate PDF: {e.stderr}") from e
    except Exception as e:
        self.logger.error(f"Error generating PDF: {e}")
        raise FileProcessingError(f"Error generating PDF: {e}") from e

    self.logger.info("PDF generated successfully with pandoc")
    return output_path
|
||||
self.logger.warning(f"❌ Notion upload error: {e}")
|
||||
|
||||
Reference in New Issue
Block a user