Refine formatting: justified text, robust PDF/DOCX generation, clean Markdown style

2026-01-09 18:33:38 -03:00
parent e6a01d08d4
commit f7fdb0b622
1 changed file with 105 additions and 22 deletions
--- a/document/generators.py
+++ b/document/generators.py
@@ -80,6 +80,7 @@ Instrucciones:
 - Asegúrate de que los encabezados estén bien espaciados
 - Verifica que las viñetas usen "- " correctamente
 - Mantén exactamente el contenido existente
 - EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave
 - Devuelve únicamente el resumen formateado sin texto adicional"""
            # Use generic Gemini provider for formatting as requested
@@ -169,7 +170,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
        return output_path
    def _create_docx(self, summary: str, base_name: str) -> Path:
-        """Create DOCX document"""
+        """Create DOCX document with Markdown parsing (Legacy method ported)"""
        try:
            from docx import Document
            from docx.shared import Inches
@@ -184,8 +185,50 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
        doc = Document()
        doc.add_heading(base_name.replace('_', ' ').title(), 0)
-        doc.add_heading('Resumen', level=1)
+        # Parse and render Markdown content line by line
-        doc.add_paragraph(summary)
+        lines = summary.splitlines()
        current_paragraph = []
        for line in lines:
            line = line.strip()
            if not line:
                if current_paragraph:
                    p = doc.add_paragraph(' '.join(current_paragraph))
                    p.alignment = 3  # JUSTIFY alignment (WD_ALIGN_PARAGRAPH.JUSTIFY=3)
                    current_paragraph = []
                continue
            if line.startswith('#'):
                if current_paragraph:
                    p = doc.add_paragraph(' '.join(current_paragraph))
                    p.alignment = 3
                    current_paragraph = []
                # Process heading
                level = len(line) - len(line.lstrip('#'))
                heading_text = line.lstrip('#').strip()
                if level <= 6:
                    doc.add_heading(heading_text, level=level)
                else:
                    current_paragraph.append(heading_text)
            elif line.startswith('-') or line.startswith('*') or line.startswith('•'):
                if current_paragraph:
                    p = doc.add_paragraph(' '.join(current_paragraph))
                    p.alignment = 3
                    current_paragraph = []
                bullet_text = line.lstrip('-*• ').strip()
                p = doc.add_paragraph(bullet_text, style='List Bullet')
                # Remove bold markers from bullets if present
                if '**' in bullet_text:
                    # Basic cleanup for bullets
                    pass 
            else:
                # Clean up excessive bold markers in body text if user requested
                clean_line = line.replace('**', '') # Removing asterisks as per user complaint "se abusa de los asteriscos"
                current_paragraph.append(clean_line)
        if current_paragraph:
            p = doc.add_paragraph(' '.join(current_paragraph))
            p.alignment = 3
        doc.add_page_break()
        doc.add_paragraph(f"*Generado por CBCFacil*")
@@ -194,10 +237,11 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
        return output_path
    def _create_pdf(self, summary: str, base_name: str) -> Path:
-        """Create PDF document"""
+        """Create PDF document with Markdown parsing (Legacy method ported)"""
        try:
            from reportlab.lib.pagesizes import letter
            from reportlab.pdfgen import canvas
            import textwrap
        except ImportError:
            raise FileProcessingError("reportlab not installed")
@@ -208,28 +252,67 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
        c = canvas.Canvas(str(output_path), pagesize=letter)
        width, height = letter
        margin = 72
        y_position = height - margin
-        # Add title
+        def new_page():
-        c.setFont("Helvetica-Bold", 16)
+            nonlocal y_position
        title = base_name.replace('_', ' ').title()
        c.drawString(100, height - 100, title)
        # Add summary
        c.setFont("Helvetica", 12)
        y_position = height - 140
        # Simple text wrapping
        lines = summary.split('\n')
        for line in lines:
            if y_position < 100:
            c.showPage()
-                y_position = height - 100
+            c.setFont('Helvetica', 11)
-                c.setFont("Helvetica", 12)
+            y_position = height - margin
-            c.drawString(100, y_position, line)
+        c.setFont('Helvetica', 11)
-            y_position -= 20
+
        # Title
        c.setFont('Helvetica-Bold', 16)
        c.drawString(margin, y_position, base_name.replace('_', ' ').title()[:100])
        y_position -= 28
        c.setFont('Helvetica', 11)
        summary_clean = summary.replace('**', '') # Remove asterisks globally for cleaner PDF
        for raw_line in summary_clean.splitlines():
            line = raw_line.rstrip()
            if not line.strip():
                y_position -= 14
                if y_position < margin:
                    new_page()
                continue
            stripped = line.lstrip()
            if stripped.startswith('#'):
                level = len(stripped) - len(stripped.lstrip('#'))
                heading_text = stripped.lstrip('#').strip()
                if heading_text:
                    font_size = 16 if level == 1 else 14 if level == 2 else 12
                    c.setFont('Helvetica-Bold', font_size)
                    c.drawString(margin, y_position, heading_text[:90])
                    y_position -= font_size + 6
                    if y_position < margin:
                        new_page()
                    c.setFont('Helvetica', 11)
                continue
            if stripped.startswith(('-', '*', '•')):
                bullet_text = stripped.lstrip('-*•').strip()
                wrapped_lines = textwrap.wrap(bullet_text, width=80) or ['']
                for idx, wrapped in enumerate(wrapped_lines):
                    prefix = '• ' if idx == 0 else '  '
                    c.drawString(margin, y_position, f"{prefix}{wrapped}")
                    y_position -= 14
                    if y_position < margin:
                        new_page()
                continue
            # Body text - Justified approximation (ReportLab native justification requires Paragraph styles, defaulting to wrap)
            wrapped_lines = textwrap.wrap(stripped, width=90) or ['']
            for wrapped in wrapped_lines:
                c.drawString(margin, y_position, wrapped)
                y_position -= 14
                if y_position < margin:
                    new_page()
        c.showPage()
        c.save()
        return output_path