fix: Mejora parser de tablas LaTeX

- Elimina líneas hline duplicadas
- Mejora limpieza de comandos LaTeX en celdas
- Usa regex para conservar el contenido de \textbf{...}, \textit{...} y \emph{...} al quitar el comando
- Filtra celdas vacías y descarta filas con menos de 2 columnas

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
renato97
2026-02-25 17:32:18 +00:00
parent d902203b59
commit d50772d962

View File

@@ -94,13 +94,20 @@ class PDFGenerator:
break
if in_table:
# Saltar líneas de hline
if "hline" in line.replace("\\", "").replace(" ", ""):
i += 1
continue
# Procesar línea de tabla
# Reemplazar & por separador y eliminar \\
row_data = line.replace("&", "|").replace("\\", "").replace("\\\\", "")
# Limpiar formato LaTeX básico
row_data = row_data.replace("hline", "").replace("\\hline", "")
# Reemplazar & por separador
row_data = line.replace("&", "|")
# Eliminar comandos LaTeX
row_data = row_data.replace("\\", "").replace("\\\\", "").replace("hline", "")
cells = [c.strip() for c in row_data.split("|") if c.strip()]
if cells:
# Filtrar celdas vacías
cells = [c for c in cells if c and c != "|"]
if cells and len(cells) > 1: # Al menos 2 columnas para ser tabla válida
table_lines.append({"type": "row", "data": cells})
i += 1
@@ -118,11 +125,15 @@ class PDFGenerator:
row = []
for cell in tl["data"]:
cell = cell.strip()
# Eliminar comandos LaTeX restantes
cell = cell.replace("\\textbf{", "").replace("}", "")
cell = cell.replace("\\textit{", "")
cell = cell.replace("\\emph{", "")
# Eliminar comandos LaTeX restantes (manejar {contenido})
import re
# Eliminar \textbf{...}, \textit{...}, \emph{...}
cell = re.sub(r'\\textbf\{([^}]*)\}', r'\1', cell)
cell = re.sub(r'\\textit\{([^}]*)\}', r'\1', cell)
cell = re.sub(r'\\emph\{([^}]*)\}', r'\1', cell)
cell = cell.replace("\\", "").replace("{", "").replace("}", "")
cell = cell.strip()
if cell:
row.append(cell)
if row:
data.append(row)