- Corrige PDFGenerator para pasar contenido (no ruta) - Agrega prompt siguiendo código.md (español, estructura académica) - Limpia thinking tokens de respuesta AI - Agrega skip de archivos ya procesados en watcher - Implementa tablas LaTeX en PDFs (reportlab Table) - Agrega load_dotenv() en main.py - Actualiza .env con MiniMax config - Agrega transcriptions/ a .gitignore Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
365 lines
12 KiB
Python
365 lines
12 KiB
Python
"""
|
|
Generador de PDFs desde texto y markdown.
|
|
|
|
Utiliza reportlab para la generación de PDFs con soporte UTF-8.
|
|
"""
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Optional, Union
|
|
|
|
from reportlab.lib import colors
|
|
from reportlab.lib.pagesizes import A4
|
|
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
|
|
from reportlab.lib.units import cm
|
|
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer, Table, TableStyle
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PDFGenerator:
    """Generate PDF files from plain text or basic markdown using reportlab.

    The markdown support is deliberately small: ``#``/``##``/``###``
    headings, ``-``/``*`` bullets, numbered lists, ``**bold**``, ``*italic*``,
    horizontal rules and LaTeX ``tabular`` environments embedded in the text.
    """

    # Stand-in for an escaped ampersand (``\&``) while a LaTeX row is being
    # split on the ``&`` column separator.
    _AMP_PLACEHOLDER = "\x00"

    def __init__(self) -> None:
        """Load reportlab's sample stylesheet and register the custom styles."""
        self._styles = getSampleStyleSheet()
        self._setup_styles()
        logger.info("PDFGenerator inicializado")

    def _setup_styles(self) -> None:
        """Register the ``Custom*`` paragraph styles used by the generator."""
        custom_styles = (
            # (name, parent style, font size, leading, space after)
            ("CustomNormal", "Normal", 11, 14, 6),
            ("CustomHeading1", "Heading1", 18, 22, 12),
            ("CustomHeading2", "Heading2", 14, 18, 10),
        )
        for name, parent, font_size, leading, space_after in custom_styles:
            self._styles.add(
                ParagraphStyle(
                    name=name,
                    parent=self._styles[parent],
                    fontSize=font_size,
                    leading=leading,
                    spaceAfter=space_after,
                )
            )

    def _escape_xml(self, text: str) -> str:
        """Escape ``&``, ``<`` and ``>`` and turn newlines into ``<br/>``.

        The result is meant to be embedded in reportlab ``Paragraph`` markup,
        where unescaped angle brackets or ampersands would be read as tags or
        entities.

        Args:
            text: Raw text.

        Returns:
            The escaped text.
        """
        # Imported locally so the method does not depend on the module's
        # top-level import list.
        from xml.sax.saxutils import escape

        # escape() handles "&" first, so already-produced entities are not
        # escaped a second time.
        return escape(text).replace("\n", "<br/>")

    @staticmethod
    def _clean_latex_cell(cell: str) -> str:
        """Strip basic LaTeX formatting commands from a single table cell."""
        # Remove the known commands while their backslash is still present;
        # stripping backslashes first would leave "textbf" glued to the text.
        for command in ("\\textbf{", "\\textit{", "\\emph{"):
            cell = cell.replace(command, "")
        # Drop the closing braces and any remaining escapes (e.g. "\%" -> "%").
        return cell.replace("}", "").replace("\\", "").strip()

    @staticmethod
    def _extract_latex_rows(
        lines: list[str], start_idx: int
    ) -> tuple[list[list[str]], int]:
        """Collect the cell texts of the next ``tabular`` environment.

        Scanning starts at ``start_idx`` and stops at the first line containing
        ``end{tabular}``. Lines before ``begin{tabular}`` are ignored.

        Args:
            lines: All lines of the document.
            start_idx: Index at which to start scanning.

        Returns:
            ``(rows, end_idx)``. ``rows`` is a rectangular list of rows (short
            rows are padded with empty strings). ``end_idx`` is the index of
            the ``end{tabular}`` line, or ``len(lines)`` if none was found.
        """
        placeholder = PDFGenerator._AMP_PLACEHOLDER
        rows: list[list[str]] = []
        in_table = False
        i = start_idx

        while i < len(lines):
            line = lines[i].strip()

            if "begin{tabular}" in line:
                # The column specification is not used for rendering.
                in_table = True
            elif "end{tabular}" in line:
                break
            elif in_table:
                # Drop rules and the "\\" row terminator, then protect "\&"
                # so it is not mistaken for a column separator.
                content = (
                    line.replace("\\hline", "")
                    .replace("\\\\", "")
                    .replace("\\&", placeholder)
                )
                if content.strip():
                    # Empty cells are kept so later cells stay in their column.
                    cells = [
                        PDFGenerator._clean_latex_cell(
                            cell.replace(placeholder, "&")
                        )
                        for cell in content.split("&")
                    ]
                    if any(cells):
                        rows.append(cells)
            i += 1

        if rows:
            # Keep the grid rectangular for the table constructor.
            width = max(len(row) for row in rows)
            rows = [row + [""] * (width - len(row)) for row in rows]

        return rows, i

    def _parse_latex_table(
        self, lines: list[str], start_idx: int
    ) -> "tuple[Optional[Table], int]":
        """Convert a LaTeX ``tabular`` environment into a reportlab ``Table``.

        Args:
            lines: All lines of the document.
            start_idx: Index at which to start looking for the table.

        Returns:
            ``(table, end_idx)`` where ``end_idx`` is the index of the
            ``end{tabular}`` line (or ``len(lines)`` if the table is not
            terminated). ``(None, start_idx)`` if no rows were found or the
            table could not be built.
        """
        rows, end_idx = self._extract_latex_rows(lines, start_idx)
        if not rows:
            return None, start_idx

        try:
            table = Table(rows)
            table.setStyle(TableStyle([
                # Header row.
                ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                # Body rows.
                ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
                # Whole table. The font size previously ended at column "-0"
                # (== 0), which only covered the first column.
                ('FONTSIZE', (0, 0), (-1, -1), 10),
                ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ]))
        except Exception as e:
            # Best effort: a broken table must not abort the whole document.
            logger.warning(f"Error parsing LaTeX table: {e}")
            return None, start_idx

        return table, end_idx

    def _parse_markdown_basic(
        self, markdown: str
    ) -> "list[Union[Paragraph, Spacer, Table]]":
        """Convert basic markdown into a list of reportlab flowables.

        Handles headings, bold/italic, horizontal rules, bullet and numbered
        lists, blank lines and embedded LaTeX ``tabular`` environments.

        Args:
            markdown: Markdown source text.

        Returns:
            The flowables (paragraphs, spacers and tables) in document order.
        """
        elements: list = []
        lines = markdown.split("\n")
        normal = self._styles["CustomNormal"]

        # An explicit index is needed so a table can consume several lines.
        i = 0
        while i < len(lines):
            current = i
            line = lines[current].strip()
            i += 1

            if not line:
                elements.append(Spacer(1, 0.3 * cm))
                continue

            # Headings (checked from the longest prefix to the shortest).
            if line.startswith("### "):
                text = self._escape_xml(line[4:])
                elements.append(
                    Paragraph(f"<b>{text}</b>", self._styles["CustomHeading2"])
                )
            elif line.startswith("## "):
                text = self._escape_xml(line[3:])
                elements.append(
                    Paragraph(f"<b>{text}</b>", self._styles["CustomHeading1"])
                )
            elif line.startswith("# "):
                text = self._escape_xml(line[2:])
                elements.append(
                    Paragraph(
                        f"<b><i>{text}</i></b>", self._styles["CustomHeading1"]
                    )
                )
            # Horizontal rule: rendered as a small vertical gap.
            elif line in ("---", "***"):
                elements.append(Spacer(1, 0.2 * cm))
            # LaTeX table: same marker the table parser looks for.
            elif "begin{tabular}" in line:
                latex_table, end_idx = self._parse_latex_table(lines, current)
                if latex_table is None:
                    # Unparseable table: keep the raw line so nothing is lost.
                    elements.append(Paragraph(self._escape_xml(line), normal))
                else:
                    elements.append(Spacer(1, 0.3 * cm))
                    elements.append(latex_table)
                    elements.append(Spacer(1, 0.3 * cm))
                    # Resume after the "end{tabular}" line.
                    i = end_idx + 1
            # Bullet list.
            elif line.startswith("- ") or line.startswith("* "):
                text = self._escape_xml(line[2:])
                text = f"• {self._format_inline_markdown(text)}"
                elements.append(Paragraph(text, normal))
            else:
                number, separator, rest = line.partition(". ")
                if separator and number.isdigit():
                    # Numbered list: only "<digits>. text" qualifies, and the
                    # number is kept in the output.
                    text = self._format_inline_markdown(self._escape_xml(rest))
                    text = f"{number}. {text}"
                else:
                    # Normal paragraph.
                    text = self._format_inline_markdown(self._escape_xml(line))
                elements.append(Paragraph(text, normal))

        return elements

    def _format_inline_markdown(self, text: str) -> str:
        """Convert inline markdown emphasis to reportlab paragraph tags.

        ``***x***`` becomes ``<b><i>x</i></b>``, ``**x**`` becomes ``<b>x</b>``
        and ``*x*`` becomes ``<i>x</i>``. Unmatched markers are left untouched.
        Emphasis that overlaps without nesting (e.g. ``**a *b***``) is not
        supported and may still produce mis-nested tags.

        Args:
            text: Text that has already been XML-escaped.

        Returns:
            The text with emphasis markers replaced by tags.
        """

        def wrap(source: str, marker: str, opening: str, closing: str) -> str:
            """Replace each ``marker ... marker`` pair with the given tags."""
            width = len(marker)
            pos = 0
            while True:
                start = source.find(marker, pos)
                if start == -1:
                    break
                end = source.find(marker, start + width)
                if end == -1:
                    break
                inner = source[start + width:end]
                if not inner:
                    # Adjacent markers (e.g. a stray "**" seen by the "*"
                    # pass) are not emphasis; skip them instead of emitting
                    # an empty tag pair.
                    pos = start + 1
                    continue
                replacement = f"{opening}{inner}{closing}"
                source = source[:start] + replacement + source[end + width:]
                pos = start + len(replacement)
            return source

        # Longest marker first so "***x***" yields properly nested tags.
        text = wrap(text, "***", "<b><i>", "</i></b>")
        text = wrap(text, "**", "<b>", "</b>")
        text = wrap(text, "*", "<i>", "</i>")
        return text

    def _create_document(self, output_path: Path) -> "SimpleDocTemplate":
        """Create an A4 document template with 2 cm margins on every side."""
        margin = 2 * cm
        return SimpleDocTemplate(
            str(output_path),
            pagesize=A4,
            leftMargin=margin,
            rightMargin=margin,
            topMargin=margin,
            bottomMargin=margin,
        )

    def markdown_to_pdf(self, markdown_text: str, output_path: Path) -> Path:
        """Convert markdown to a PDF file.

        Args:
            markdown_text: Content in markdown format.
            output_path: Path where the PDF will be written.

        Returns:
            Path: The path of the generated PDF (``output_path``).

        Raises:
            ValueError: If the content is empty or whitespace only.
            IOError: If anything fails while building or writing the PDF.
        """
        if not markdown_text or not markdown_text.strip():
            logger.warning("markdown_to_pdf llamado con contenido vacío")
            raise ValueError("El contenido markdown no puede estar vacío")

        logger.info(
            "Convirtiendo markdown a PDF",
            extra={
                "content_length": len(markdown_text),
                "output_path": str(output_path),
            },
        )

        try:
            doc = self._create_document(output_path)
            elements = self._parse_markdown_basic(markdown_text)
            doc.build(elements)
        except Exception as e:
            # Callers are documented to catch IOError for any build failure.
            logger.error(f"Error al generar PDF desde markdown: {e}")
            raise IOError(f"Error al generar PDF: {e}") from e
        else:
            logger.info(
                "PDF generado exitosamente",
                extra={"output_path": str(output_path), "pages": "unknown"},
            )
            return output_path

    def text_to_pdf(self, text: str, output_path: Path) -> Path:
        """Convert plain text to a PDF file, one paragraph per input line.

        Args:
            text: Plain text content.
            output_path: Path where the PDF will be written.

        Returns:
            Path: The path of the generated PDF (``output_path``).

        Raises:
            ValueError: If the content is empty or whitespace only.
            IOError: If anything fails while building or writing the PDF.
        """
        if not text or not text.strip():
            logger.warning("text_to_pdf llamado con contenido vacío")
            raise ValueError("El contenido de texto no puede estar vacío")

        logger.info(
            "Convirtiendo texto a PDF",
            extra={
                "content_length": len(text),
                "output_path": str(output_path),
            },
        )

        try:
            doc = self._create_document(output_path)

            # One paragraph per non-empty line; blank lines become spacing.
            elements: list = []
            for raw_line in text.split("\n"):
                line = raw_line.strip()
                if line:
                    elements.append(
                        Paragraph(
                            self._escape_xml(line), self._styles["CustomNormal"]
                        )
                    )
                else:
                    elements.append(Spacer(1, 0.3 * cm))

            doc.build(elements)
        except Exception as e:
            # Callers are documented to catch IOError for any build failure.
            logger.error(f"Error al generar PDF desde texto: {e}")
            raise IOError(f"Error al generar PDF: {e}") from e
        else:
            logger.info(
                "PDF generado exitosamente",
                extra={"output_path": str(output_path), "pages": "unknown"},
            )
            return output_path
|
|
|
|
|
|
# Instancia global del generador
|
|
# Shared module-level instance; constructing it here runs
# PDFGenerator.__init__ when the module is imported.
pdf_generator = PDFGenerator()
|