feat: Sistema LaTeX mejorado con sanitización automática y corrección de TikZ

Cambios principales:

## Nuevos archivos
- services/ai/parallel_provider.py: Ejecución paralela de múltiples proveedores AI
- services/ai/prompt_manager.py: Gestión centralizada de prompts (resumen.md como fuente)
- latex/resumen.md: Template del prompt para resúmenes académicos LaTeX

## Mejoras en generación LaTeX (document/generators.py)
- Nueva función _sanitize_latex(): Corrige automáticamente errores comunes de AI
  - Agrega align=center a nodos TikZ con saltos de línea (\\)
  - Previene errores 'Not allowed in LR mode' antes de compilar
- Soporte para procesamiento paralelo de proveedores AI
- Conversión DOCX en paralelo con generación PDF
- Uploads a Notion en background (non-blocking)
- Callbacks de notificación para progreso en Telegram

## Mejoras en proveedores AI
- claude_provider.py: fix_latex() con instrucciones específicas para errores TikZ
- gemini_provider.py: fix_latex() mejorado + rate limiting + circuit breaker
- provider_factory.py: Soporte para parallel provider

## Otros cambios
- config/settings.py: Nuevas configuraciones para Gemini models
- services/webdav_service.py: Mejoras en manejo de conexión
- .gitignore: Ignora archivos LaTeX auxiliares (.aux, .toc, .out, .pdf)

## Archivos de ejemplo
- latex/imperio_romano.tex, latex/clase_revolucion_rusa_crisis_30.tex
- resumen_curiosidades.tex (corregido y compilado exitosamente)
This commit is contained in:
renato97
2026-02-07 20:50:27 +00:00
parent 915f827305
commit dcf887c510
15 changed files with 4309 additions and 409 deletions

View File

@@ -1,352 +1,669 @@
"""
Document generation utilities
Document generation utilities - LaTeX Academic Summary System
This module generates comprehensive academic summaries in LaTeX format
following the specifications in latex/resumen.md (the SINGLE SOURCE OF TRUTH).
Parallel Processing: Uses multiple agents for accelerated summary generation:
- AI Provider Racing: Multiple AI providers generate in parallel
- Parallel Format Conversion: PDF + DOCX generated simultaneously
- Background Notion Uploads: Non-blocking uploads to Notion
"""
import logging
import subprocess
import shutil
import re
import threading
from pathlib import Path
from typing import Dict, Any, List, Tuple
from typing import Dict, Any, Optional, Tuple, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
from core import FileProcessingError
from config import settings
from services.ai import ai_provider_factory
from services.ai.prompt_manager import prompt_manager
def _sanitize_latex(latex_code: str) -> str:
"""
Pre-process LaTeX code to fix common errors before compilation.
This function applies automated fixes for known issues that AI models
frequently generate, reducing the need for fix_latex() iterations.
Currently handles:
- TikZ nodes with line breaks (\\\\) missing align=center
- Unbalanced environments (best effort)
"""
if not latex_code:
return latex_code
result = latex_code
# Fix TikZ nodes with \\\\ but missing align=center
# Pattern: \node[...] (name) {Text\\More};
# This is a common AI error - TikZ requires align=center for \\\\ in nodes
# We need to find \node commands and add align=center if they have \\\\ in content
# but don't already have align= in their options
def fix_tikz_node(match):
"""Fix a single TikZ node by adding align=center if needed"""
full_match = match.group(0)
options = match.group(1) # Content inside [...]
rest = match.group(2) # Everything after options
# Check if this node has \\\\ in its content (text between { })
# and doesn't already have align=
if "\\\\" in rest and "align=" not in options:
# Add align=center to the options
if options.strip():
new_options = options.rstrip() + ", align=center"
else:
new_options = "align=center"
return f"\\node[{new_options}]{rest}"
return full_match
# Match \node[options] followed by rest of the line
# Capture options and the rest separately
tikz_node_pattern = r"\\node\[([^\]]*)\]([^;]*;)"
result = re.sub(tikz_node_pattern, fix_tikz_node, result)
return result
class DocumentGenerator:
    """
    Generates academic summary documents in LaTeX format.

    The system follows these principles:
    1. latex/resumen.md is the SINGLE SOURCE OF TRUTH for prompt structure
    2. Generates full LaTeX documents (not Markdown)
    3. Compiles to PDF using pdflatex
    4. Supports iterative fixing with AI if compilation fails
    5. Supports progress notifications via callback
    """

    def __init__(self, notification_callback: Optional[Callable[[str], None]] = None):
        """
        Initialize DocumentGenerator.

        Args:
            notification_callback: Optional callback function for progress
                notifications. Takes a single string argument (message to send).
        """
        self.logger = logging.getLogger(__name__)
        self.ai_provider = ai_provider_factory.get_best_provider()
        self.notification_callback = notification_callback
        self.use_parallel = ai_provider_factory.use_parallel()
        # Shared pool for parallel format conversion (see _convert_formats_parallel)
        self.executor = ThreadPoolExecutor(max_workers=4)

        # Ensure output directories exist
        settings.LOCAL_DOWNLOADS_PATH.mkdir(parents=True, exist_ok=True)
        settings.LOCAL_DOCX.mkdir(parents=True, exist_ok=True)

        if self.use_parallel:
            self.logger.info(
                "🚀 Parallel processing enabled: Multiple AI providers available"
            )
def _notify(self, message: str) -> None:
"""Send notification if callback is configured"""
if self.notification_callback:
try:
self.notification_callback(message)
except Exception as e:
self.logger.warning(f"Failed to send notification: {e}")
def _generate_with_parallel_provider(self, prompt: str, **kwargs) -> str:
"""
Generate content using multiple AI providers in parallel.
Races multiple providers and returns the first successful response,
or the best quality response if using consensus strategy.
"""
try:
parallel_provider = ai_provider_factory.get_parallel_provider(max_workers=4)
self.logger.info("🚀 Using parallel AI provider (race mode)")
result = parallel_provider.generate_parallel(
prompt=prompt,
strategy="race", # Use first successful response
timeout_ms=300000, # 5 minutes
**kwargs,
)
self.logger.info(
f"✅ Parallel generation complete: {result.selected_provider} selected, "
f"{result.total_duration_ms}ms"
)
return result.content
except Exception as e:
self.logger.warning(
f"⚠️ Parallel generation failed: {e}, falling back to single provider"
)
return self.ai_provider.generate_text(prompt, **kwargs)
def _convert_formats_parallel(
self, tex_path: Path, pdf_path: Optional[Path], base_name: str
) -> Optional[Path]:
"""
Convert to multiple formats in parallel (DOCX, optionally PDF).
If PDF is already compiled, only DOCX is generated.
Otherwise, both PDF and DOCX are generated in parallel.
"""
futures = {}
# Generate DOCX
if shutil.which("pandoc"):
futures["docx"] = self.executor.submit(
self._convert_tex_to_docx, tex_path, base_name
)
# Wait for DOCX completion
docx_path = None
if "docx" in futures:
try:
docx_path = futures["docx"].result(timeout=60)
if docx_path:
self.logger.info(f"✅ Parallel DOCX generated: {docx_path}")
except Exception as e:
self.logger.warning(f"⚠️ DOCX generation failed: {e}")
return docx_path
def _upload_to_notion_background(
self,
base_name: str,
summary: str,
pdf_path: Optional[Path],
metadata: Dict[str, Any],
):
"""Upload to Notion in background thread (non-blocking)."""
def upload_worker():
try:
from services.notion_service import notion_service
title = base_name.replace("_", " ").title()
notion_metadata = {
"file_type": "Audio",
"pdf_path": pdf_path or Path(""),
"add_status": False,
"use_as_page": False,
}
page_id = notion_service.create_page_with_summary(
title=title, summary=summary, metadata=notion_metadata
)
if page_id:
metadata["notion_uploaded"] = True
metadata["notion_page_id"] = page_id
self.logger.info(
f"✅ Background upload to Notion complete: {title}"
)
else:
self.logger.warning(f"⚠️ Background Notion upload failed: {title}")
except Exception as e:
self.logger.warning(f"❌ Background Notion upload error: {e}")
# Start background thread
thread = threading.Thread(target=upload_worker, daemon=True)
thread.start()
self.logger.info("🔄 Notion upload started in background")
def generate_summary(
self, text: str, base_name: str
self,
text: str,
base_name: str,
materia: str = "Economía",
bibliographic_text: Optional[str] = None,
class_number: Optional[int] = None,
) -> Tuple[bool, str, Dict[str, Any]]:
"""Generate unified summary"""
self.logger.info(f"Generating summary for {base_name}")
"""
Generate comprehensive academic summary in LaTeX format.
Args:
text: The class transcription text
base_name: Base filename for output files
materia: Subject name (default: "Economía")
bibliographic_text: Optional supporting material from books/notes
class_number: Optional class number for header
Returns:
Tuple of (success, summary_text, metadata)
"""
self.logger.info(
f"🚀 Starting LaTeX academic summary generation for: {base_name}"
)
metadata = {
"filename": base_name,
"tex_path": "",
"pdf_path": "",
"markdown_path": "",
"docx_path": "",
"summary_snippet": "",
"notion_uploaded": False,
"notion_page_id": None,
"materia": materia,
}
try:
# Step 1: Generate Bullet Points (Chunking handled by provider or single prompt for now)
# Note: We use the main provider (Claude/Zai) for content generation
self.logger.info("Generating bullet points...")
bullet_prompt = f"""Analiza el siguiente texto y extrae entre 5 y 8 bullet points clave en español.
# === STEP 1: Generate LaTeX content using AI ===
self.logger.info(
"🧠 Sending request to AI Provider for LaTeX generation..."
)
self._notify("📝 Preparando prompt de resumen académico...")
REGLAS ESTRICTAS:
1. Devuelve ÚNICAMENTE bullet points, cada línea iniciando con "- "
2. Cada bullet debe ser conciso (12-20 palabras) y resaltar datos, fechas, conceptos o conclusiones importantes
3. NO agregues introducciones, conclusiones ni texto explicativo
4. Concéntrate en los puntos más importantes del texto
5. Incluye fechas, datos específicos y nombres relevantes si los hay
prompt = prompt_manager.get_latex_summary_prompt(
transcription=text,
materia=materia,
bibliographic_text=bibliographic_text,
class_number=class_number,
)
Texto:
{text[:15000]}""" # Truncate to avoid context limits if necessary, though providers handle it differently
self._notify(
"🧠 Enviando solicitud a la IA (esto puede tardar unos minutos)..."
)
try:
bullet_points = self.ai_provider.generate_text(bullet_prompt)
self.logger.info(f"Bullet points generated: {len(bullet_points)}")
except Exception as e:
self.logger.warning(f"Bullet point generation failed: {e}")
bullet_points = "- Puntos clave no disponibles por error en IA"
# Use parallel provider if multiple AI providers are available
if self.use_parallel:
raw_response = self._generate_with_parallel_provider(prompt)
else:
raw_response = self.ai_provider.generate_text(prompt)
# Step 2: Generate Unified Summary
self.logger.info("Generating unified summary...")
summary_prompt = f"""Eres un profesor universitario experto en historia y economía. Redacta un resumen académico integrado en español usando el texto y los bullet points extraídos.
if not raw_response:
raise FileProcessingError("AI returned empty response")
REQUISITOS ESTRICTOS DE CONTENIDO:
- Extensión entre 500-700 palabras
- Usa encabezados Markdown con jerarquía clara (##, ###)
- Desarrolla los puntos clave con profundidad y contexto histórico/económico
- Mantén un tono académico y analítico
- Incluye conclusiones significativas
- NO agregues texto fuera del resumen
- Devuelve únicamente el resumen en formato Markdown
self.logger.info(f"📝 AI response received: {len(raw_response)} characters")
self._notify(f"✅ Respuesta recibida ({len(raw_response)} caracteres)")
REQUISITOS ESTRICTOS DE FORMATO MATEMÁTICO (LaTeX):
- Si el texto incluye fórmulas matemáticas o económicas, DEBES usar formato LaTeX.
- Usa bloques $$ ... $$ para ecuaciones centradas importantes.
- Usa $ ... $ para ecuaciones en línea.
- Ejemplo: La fórmula del interés compuesto es $A = P(1 + r/n)^{{nt}}$.
- NO uses bloques de código (```latex) para las fórmulas, úsalas directamente en el texto para que Pandoc las renderice.
# === STEP 2: Extract clean LaTeX from AI response ===
self._notify("🔍 Extrayendo código LaTeX...")
Contenido a resumir:
{text[:20000]}
latex_content = prompt_manager.extract_latex_from_response(raw_response)
Puntos clave a incluir obligatoriamente:
{bullet_points}"""
if not latex_content:
self.logger.warning(
"⚠️ No valid LaTeX found in response, treating as Markdown"
)
self._notify("⚠️ No se detectó LaTeX válido, usando modo compatible...")
# Fallback to Markdown processing
return self._fallback_to_markdown(raw_response, base_name, metadata)
try:
raw_summary = self.ai_provider.generate_text(summary_prompt)
except Exception as e:
self.logger.error(f"Raw summary generation failed: {e}")
raise e
self.logger.info("✨ Valid LaTeX content detected")
self._notify(f"✨ LaTeX detectado: {len(latex_content)} caracteres")
# Step 3: Format with IA (using main provider instead of Gemini)
self.logger.info("Formatting summary with IA...")
format_prompt = f"""Revisa y mejora el siguiente resumen en Markdown para que sea perfectamente legible y compatible con Pandoc:
# === STEP 3: Compilation Loop with Self-Correction ===
max_retries = 3
current_latex = latex_content
{raw_summary}
for attempt in range(max_retries + 1):
# Sanitize LaTeX before saving (fix common AI errors like TikZ nodes)
current_latex = _sanitize_latex(current_latex)
Instrucciones:
- Corrige cualquier error de formato Markdown
- Asegúrate de que los encabezados estén bien espaciados
- Verifica que las viñetas usen "- " correctamente
- Mantén exactamente el contenido existente
- EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave
- VERIFICA que todas las fórmulas matemáticas estén correctamente encerradas en $...$ (inline) o $$...$$ (display)
- NO alteres la sintaxis LaTeX dentro de los delimitadores $...$ o $$...$$
- Devuelve únicamente el resumen formateado sin texto adicional"""
# Save current .tex file
self._notify(
f"📄 Guardando archivo .tex (intento {attempt + 1}/{max_retries + 1})..."
)
try:
# Use the main provider (Claude/GLM) for formatting too
if self.ai_provider.is_available():
summary = self.ai_provider.generate_text(format_prompt)
else:
self.logger.warning(
"AI provider not available for formatting, using raw summary"
tex_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.tex"
tex_path.write_text(current_latex, encoding="utf-8")
metadata["tex_path"] = str(tex_path)
# Try to compile
self._notify("⚙️ Primera pasada de compilación LaTeX...")
pdf_path = self._compile_latex(
tex_path, output_dir=settings.LOCAL_DOWNLOADS_PATH
)
if pdf_path:
self.logger.info(
f"✅ Compilation success on attempt {attempt + 1}!"
)
summary = raw_summary
except Exception as e:
self.logger.warning(f"Formatting failed ({e}), using raw summary")
summary = raw_summary
self._notify("✅ PDF generado exitosamente!")
metadata["pdf_path"] = str(pdf_path)
# Generate filename
filename = self._generate_filename(text, summary)
# Generate DOCX in parallel
self._notify("📄 Generando archivo DOCX en paralelo...")
docx_path = self._convert_formats_parallel(
tex_path, pdf_path, base_name
)
if docx_path:
self._notify("✅ DOCX generado exitosamente!")
metadata["docx_path"] = str(docx_path)
# Create document
markdown_path = self._create_markdown(summary, base_name)
# Create a text summary for Notion/preview
text_summary = self._create_text_summary(current_latex)
metadata["summary_snippet"] = text_summary[:500] + "..."
docx_path = None
try:
docx_path = self._create_docx(markdown_path, base_name)
except Exception as e:
self.logger.error(f"Failed to create DOCX (non-critical): {e}")
# Upload to Notion in background if configured
if settings.has_notion_config:
self._notify("📤 Iniciando carga a Notion en segundo plano...")
self._upload_to_notion_background(
base_name=base_name,
summary=text_summary,
pdf_path=pdf_path,
metadata=metadata,
)
pdf_path = None
try:
# Sanitize LaTeX before PDF generation
self._sanitize_latex(markdown_path)
pdf_path = self._create_pdf(markdown_path, base_name)
except Exception as e:
self.logger.error(f"Failed to create PDF (non-critical): {e}")
self._notify("🎉 ¡Resumen completado con éxito!")
return True, text_summary, metadata
# Upload to Notion if configured
# Compilation failed - ask AI to fix
if attempt < max_retries:
self.logger.warning(
f"⚠️ Compilation failed (Attempt {attempt + 1}/{max_retries + 1}). "
f"Requesting AI fix..."
)
self._notify(
f"⚠️ Error de compilación ({attempt + 1}/{max_retries + 1}), solicitando corrección a IA..."
)
# Get error log
log_file = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.log"
error_log = "Log file not found"
if log_file.exists():
error_log = log_file.read_text(
encoding="utf-8", errors="ignore"
)[-2000:]
# Ask AI to fix
try:
self._notify("🔧 La IA está corrigiendo el código LaTeX...")
if hasattr(self.ai_provider, "fix_latex"):
fixed_latex = self.ai_provider.fix_latex(
current_latex, error_log
)
cleaned = prompt_manager.extract_latex_from_response(
fixed_latex
)
if cleaned:
current_latex = cleaned
else:
current_latex = fixed_latex
self._notify(
"✅ Código LaTeX corregido, reintentando compilación..."
)
else:
self.logger.error(
"❌ AI provider doesn't support fix_latex()"
)
break
except Exception as e:
self.logger.error(f"❌ AI fix request failed: {e}")
break
else:
self.logger.error(
"❌ Max retries reached. LaTeX compilation failed."
)
self._notify(
"❌ No se pudo compilar el LaTeX después de varios intentos"
)
# If we get here, all compilation attempts failed
self._notify("⚠️ Usando modo de compatibilidad Markdown...")
return self._fallback_to_markdown(
current_latex or raw_response, base_name, metadata
)
except Exception as e:
self.logger.error(
f"❌ Critical error in document generation: {e}", exc_info=True
)
self._notify(f"❌ Error en la generación: {str(e)[:100]}")
return False, "", metadata
def _compile_latex(self, tex_path: Path, output_dir: Path) -> Optional[Path]:
"""
Compile LaTeX to PDF using pdflatex. Runs twice for TOC.
Args:
tex_path: Path to .tex file
output_dir: Directory for output files
Returns:
Path to generated PDF or None if failed
"""
base_name = tex_path.stem
expected_pdf = output_dir / f"{base_name}.pdf"
# Check if pdflatex is available
if not shutil.which("pdflatex"):
self.logger.error("🚫 pdflatex not found in system PATH")
return None
cmd = [
"pdflatex",
"-interaction=nonstopmode",
"-halt-on-error",
f"-output-directory={output_dir}",
str(tex_path),
]
try:
# Pass 1
self.logger.info("⚙️ Compiling LaTeX (Pass 1/2)...")
subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
timeout=120,
)
# Pass 2 (for TOC resolution)
self.logger.info("⚙️ Compiling LaTeX (Pass 2/2)...")
result = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
timeout=120,
)
if result.returncode == 0 and expected_pdf.exists():
self.logger.info(f"✅ PDF generated: {expected_pdf}")
self._cleanup_latex_aux(output_dir, base_name)
return expected_pdf
else:
# Read log file for error info
log_file = output_dir / f"{base_name}.log"
error_snippet = "Unknown error"
if log_file.exists():
try:
log_content = log_file.read_text(
encoding="utf-8", errors="ignore"
)
errors = [
line
for line in log_content.splitlines()
if line.startswith("!")
]
if errors:
error_snippet = errors[0][:200]
except:
pass
self.logger.error(f"❌ LaTeX compilation failed: {error_snippet}")
return None
except subprocess.TimeoutExpired:
self.logger.error("❌ LaTeX compilation timed out")
return None
except Exception as e:
self.logger.error(f"❌ Error during LaTeX execution: {e}")
return None
def _convert_tex_to_docx(self, tex_path: Path, base_name: str) -> Optional[Path]:
"""Convert .tex to .docx using Pandoc."""
if not shutil.which("pandoc"):
self.logger.warning("⚠️ pandoc not found, skipping DOCX generation")
return None
docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
cmd = ["pandoc", str(tex_path), "-o", str(docx_path)]
try:
subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
self.logger.info(f"✅ DOCX generated: {docx_path}")
return docx_path
except Exception as e:
self.logger.warning(f"⚠️ DOCX generation failed: {e}")
return None
def _create_text_summary(self, latex_content: str) -> str:
"""Extract a plain text summary from LaTeX content for Notion/preview."""
# Remove LaTeX commands and keep content
text = latex_content
# Remove document class and packages
text = re.sub(r"\\documentclass\[?[^\]]*\]?\{[^\}]+\}", "", text)
text = re.sub(r"\\usepackage\{[^\}]+\}", "", text)
text = re.sub(r"\\geometry\{[^\}]+\}", "", text)
text = re.sub(r"\\pagestyle\{[^\}]+\}", "", text)
text = re.sub(r"\\fancyhf\{\}", "", text)
text = re.sub(r"\\fancyhead\[?[^\]]*\]?\{[^\}]+\}", "", text)
text = re.sub(r"\\fancyfoot\[?[^\]]*\]?\{[^\}]+\}", "", text)
# Convert sections to markdown-style
text = re.sub(r"\\section\*?\{([^\}]+)\}", r"# \1", text)
text = re.sub(r"\\subsection\*?\{([^\}]+)\}", r"## \1", text)
text = re.sub(r"\\subsubsection\*?\{([^\}]+)\}", r"### \1", text)
# Remove tcolorbox environments (keep content)
text = re.sub(
r"\\begin\{(definicion|importante|ejemplo)\}\[?[^\]]*\]?",
r"\n**\1:** ",
text,
)
text = re.sub(r"\\end\{(definicion|importante|ejemplo)\}", "", text)
# Convert itemize to bullets
text = re.sub(r"\\item\s*", "- ", text)
text = re.sub(r"\\begin\{(itemize|enumerate)\}", "", text)
text = re.sub(r"\\end\{(itemize|enumerate)\}", "", text)
# Clean up math (basic)
text = re.sub(r"\$\$([^\$]+)\$\$", r"\n\n\1\n\n", text)
text = re.sub(r"\$([^\$]+)\$", r"\1", text)
# Remove remaining LaTeX commands
text = re.sub(r"\\[a-zA-Z]+(\{[^\}]*\})*", "", text)
text = re.sub(r"[{}]", "", text)
# Clean whitespace
text = re.sub(r"\n\s*\n\s*\n", "\n\n", text)
text = text.strip()
return text
def _fallback_to_markdown(
self, content: str, base_name: str, metadata: Dict[str, Any]
) -> Tuple[bool, str, Dict[str, Any]]:
"""Fallback when LaTeX generation fails."""
self.logger.warning("⚠️ Falling back to Markdown processing")
md_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_resumen.md"
md_path.write_text(content, encoding="utf-8")
metadata["markdown_path"] = str(md_path)
# Try to convert to PDF via pandoc
if shutil.which("pandoc"):
pdf_path = self._convert_md_to_pdf(md_path, base_name)
if pdf_path:
metadata["pdf_path"] = str(pdf_path)
docx_path = self._convert_md_to_docx(md_path, base_name)
if docx_path:
metadata["docx_path"] = str(docx_path)
metadata["summary_snippet"] = content[:500] + "..."
return True, content, metadata
def _convert_md_to_pdf(self, md_path: Path, base_name: str) -> Optional[Path]:
"""Convert Markdown to PDF using pandoc."""
pdf_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.pdf"
cmd = [
"pandoc",
str(md_path),
"-o",
str(pdf_path),
"--pdf-engine=pdflatex",
"-V",
"geometry:margin=2.5cm",
]
try:
subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
self.logger.info(f"✅ PDF from Markdown: {pdf_path}")
return pdf_path
except Exception as e:
self.logger.warning(f"⚠️ PDF from Markdown failed: {e}")
return None
def _convert_md_to_docx(self, md_path: Path, base_name: str) -> Optional[Path]:
"""Convert Markdown to DOCX using pandoc."""
docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
cmd = ["pandoc", str(md_path), "-o", str(docx_path)]
try:
subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
self.logger.info(f"✅ DOCX from Markdown: {docx_path}")
return docx_path
except Exception as e:
self.logger.warning(f"⚠️ DOCX from Markdown failed: {e}")
return None
def _cleanup_latex_aux(self, output_dir: Path, base_name: str):
"""Clean up auxiliary LaTeX files."""
extensions = [".aux", ".log", ".out", ".toc"]
for ext in extensions:
aux_file = output_dir / f"{base_name}{ext}"
if aux_file.exists():
try:
aux_file.unlink()
except:
pass
def _upload_to_notion(
self,
base_name: str,
summary: str,
pdf_path: Optional[Path],
metadata: Dict[str, Any],
):
"""Upload summary to Notion if configured."""
try:
from services.notion_service import notion_service
notion_uploaded = False
notion_page_id = None
if settings.has_notion_config:
try:
title = base_name.replace("_", " ").title()
title = base_name.replace("_", " ").title()
notion_metadata = {
"file_type": "Audio",
"pdf_path": pdf_path or Path(""),
"add_status": False,
"use_as_page": False,
}
# Crear página con el contenido completo del resumen
notion_metadata = {
"file_type": "Audio", # O 'PDF' dependiendo del origen
"pdf_path": pdf_path if pdf_path else Path(""),
"add_status": False, # No usar Status/Tipo (no existen en la DB)
"use_as_page": False, # Usar como database, no página
}
page_id = notion_service.create_page_with_summary(
title=title, summary=summary, metadata=notion_metadata
)
notion_page_id = notion_service.create_page_with_summary(
title=title, summary=summary, metadata=notion_metadata
)
if notion_page_id:
notion_uploaded = True
self.logger.info(
f"✅ Resumen subido a Notion: {title} (ID: {notion_page_id})"
)
else:
self.logger.warning(f"⚠️ No se pudo subir a Notion: {title}")
except Exception as e:
self.logger.warning(f"❌ Error al subir a Notion: {e}")
import traceback
traceback.print_exc()
if page_id:
metadata["notion_uploaded"] = True
metadata["notion_page_id"] = page_id
self.logger.info(f"✅ Uploaded to Notion: {title}")
else:
self.logger.info("Notion not configured - skipping upload")
metadata = {
"markdown_path": str(markdown_path),
"docx_path": str(docx_path) if docx_path else "",
"pdf_path": str(pdf_path) if pdf_path else "",
"docx_name": Path(docx_path).name if docx_path else "",
"summary": summary,
"filename": filename,
"notion_uploaded": notion_uploaded,
"notion_page_id": notion_page_id,
}
return True, summary, metadata
self.logger.warning(f"⚠️ Notion upload failed: {title}")
except Exception as e:
self.logger.error(f"Document generation process failed: {e}")
return False, "", {}
def _sanitize_latex(self, markdown_path: Path) -> None:
"""Sanitize LaTeX syntax in Markdown file to prevent Pandoc errors"""
try:
content = markdown_path.read_text(encoding="utf-8")
# 1. Unescape escaped dollar signs which are common LLM errors for math
content = content.replace(r"\$", "$")
# 2. Fix common Cyrillic and Greek characters that sneak in via LLMs
replacements = {
"ч": "ch",
"в": "v",
"к": "k",
"м": "m",
"н": "n",
"т": "t",
"": "-",
"": "-",
"": '"',
"": '"',
"": "'",
"Δ": "$\\Delta$",
"δ": "$\\delta$",
"Σ": "$\\Sigma$",
"σ": "$\\sigma$",
"π": "$\\pi$",
"Π": "$\\Pi$",
"α": "$\\alpha$",
"β": "$\\beta$",
"γ": "$\\gamma$",
"θ": "$\\theta$",
"λ": "$\\lambda$",
"μ": "$\\mu$",
}
# Be careful not to double-replace already correct LaTeX
for char, repl in replacements.items():
if char in content:
# Check if it's already inside math mode would be complex,
# but for now we assume raw unicode greek chars should become latex
content = content.replace(char, repl)
markdown_path.write_text(content, encoding="utf-8")
self.logger.info(f"Sanitized LaTeX in {markdown_path}")
except Exception as e:
self.logger.warning(f"Failed to sanitize LaTeX: {e}")
def _generate_filename(self, text: str, summary: str) -> str:
"""Generate intelligent filename"""
try:
# Use AI to extract key topics
prompt = f"""Extract 2-3 key topics from this summary to create a filename.
Summary: {summary}
Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
try:
topics_text = self.ai_provider.generate_text(prompt)
except Exception:
topics_text = summary[:100]
# Simple topic extraction
topics = re.findall(r"\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b", topics_text)[:3]
if not topics:
topics = ["documento"]
# Limit topic length
topics = [t[: settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]
filename = "_".join(topics)[: settings.MAX_FILENAME_LENGTH]
return filename
except Exception as e:
self.logger.error(f"Filename generation failed: {e}")
return "documento"
def _create_markdown(self, summary: str, base_name: str) -> Path:
"""Create Markdown document"""
output_dir = settings.LOCAL_DOWNLOADS_PATH
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{base_name}_unificado.md"
content = f"""# {base_name.replace("_", " ").title()}
## Resumen
{summary}
---
*Generado por CBCFacil*
"""
with open(output_path, "w", encoding="utf-8") as f:
f.write(content)
return output_path
def _create_docx(self, markdown_path: Path, base_name: str) -> Path:
"""Create DOCX document using pandoc"""
output_dir = settings.LOCAL_DOCX
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{base_name}_unificado.docx"
self.logger.info(
f"Converting Markdown to DOCX: {markdown_path} -> {output_path}"
)
try:
cmd = [
"pandoc",
str(markdown_path),
"-o",
str(output_path),
"--from=markdown",
"--to=docx",
]
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
self.logger.info("DOCX generated successfully with pandoc")
return output_path
except subprocess.CalledProcessError as e:
self.logger.error(f"Pandoc DOCX conversion failed: {e.stderr}")
raise FileProcessingError(f"Failed to generate DOCX: {e.stderr}")
except Exception as e:
self.logger.error(f"Error generating DOCX: {e}")
raise FileProcessingError(f"Error generating DOCX: {e}")
def _create_pdf(self, markdown_path: Path, base_name: str) -> Path:
"""Create PDF document using pandoc and pdflatex"""
output_dir = settings.LOCAL_DOWNLOADS_PATH
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{base_name}_unificado.pdf"
self.logger.info(
f"Converting Markdown to PDF: {markdown_path} -> {output_path}"
)
try:
cmd = [
"pandoc",
str(markdown_path),
"-o",
str(output_path),
"--pdf-engine=pdflatex",
"-V",
"geometry:margin=2.5cm",
"-V",
"fontsize=12pt",
"--highlight-style=tango",
]
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
self.logger.info("PDF generated successfully with pandoc")
return output_path
except subprocess.CalledProcessError as e:
self.logger.error(f"Pandoc PDF conversion failed: {e.stderr}")
raise FileProcessingError(f"Failed to generate PDF: {e.stderr}")
except Exception as e:
self.logger.error(f"Error generating PDF: {e}")
raise FileProcessingError(f"Error generating PDF: {e}")
self.logger.warning(f"❌ Notion upload error: {e}")