From f7fdb0b622326c2e0bad86eceb2a059988867137 Mon Sep 17 00:00:00 2001 From: renato97 Date: Fri, 9 Jan 2026 18:33:38 -0300 Subject: [PATCH] Refine formatting: Justified text, robust PDF/DOCX generation, clean markdown style --- document/generators.py | 127 ++++++++++++++++++++++++++++++++++------- 1 file changed, 105 insertions(+), 22 deletions(-) diff --git a/document/generators.py b/document/generators.py index 4f4d576..f4f3d59 100644 --- a/document/generators.py +++ b/document/generators.py @@ -80,6 +80,7 @@ Instrucciones: - Asegúrate de que los encabezados estén bien espaciados - Verifica que las viñetas usen "- " correctamente - Mantén exactamente el contenido existente +- EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave - Devuelve únicamente el resumen formateado sin texto adicional""" # Use generic Gemini provider for formatting as requested @@ -169,7 +170,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:""" return output_path def _create_docx(self, summary: str, base_name: str) -> Path: - """Create DOCX document""" + """Create DOCX document with Markdown parsing (Legacy method ported)""" try: from docx import Document from docx.shared import Inches @@ -184,8 +185,50 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:""" doc = Document() doc.add_heading(base_name.replace('_', ' ').title(), 0) - doc.add_heading('Resumen', level=1) - doc.add_paragraph(summary) + # Parse and render Markdown content line by line + lines = summary.splitlines() + current_paragraph = [] + + for line in lines: + line = line.strip() + if not line: + if current_paragraph: + p = doc.add_paragraph(' '.join(current_paragraph)) + p.alignment = 3 # JUSTIFY alignment (WD_ALIGN_PARAGRAPH.JUSTIFY=3) + current_paragraph = [] + continue + + if line.startswith('#'): + if current_paragraph: + p = doc.add_paragraph(' '.join(current_paragraph)) + p.alignment = 3 + current_paragraph = [] + # Process heading + level = len(line) - len(line.lstrip('#')) + heading_text = line.lstrip('#').strip() + if level <= 6: + doc.add_heading(heading_text, level=level) + else: + current_paragraph.append(heading_text) + elif line.startswith('-') or line.startswith('*') or line.startswith('•'): + if current_paragraph: + p = doc.add_paragraph(' '.join(current_paragraph)) + p.alignment = 3 + current_paragraph = [] + bullet_text = line.lstrip('-*• ').strip() + p = doc.add_paragraph(bullet_text, style='List Bullet') + # Remove bold markers from bullets if present + if '**' in bullet_text: + # Basic cleanup for bullets + pass + else: + # Clean up excessive bold markers in body text if user requested + clean_line = line.replace('**', '') # Removing asterisks as per user complaint "se abusa de los asteriscos" + current_paragraph.append(clean_line) + + if current_paragraph: + p = doc.add_paragraph(' '.join(current_paragraph)) + p.alignment = 3 doc.add_page_break() doc.add_paragraph(f"*Generado por CBCFacil*") @@ -194,10 +237,11 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:""" return output_path def _create_pdf(self, summary: str, base_name: str) -> Path: - """Create PDF document""" + """Create PDF document with Markdown parsing (Legacy method ported)""" try: from reportlab.lib.pagesizes import letter from reportlab.pdfgen import canvas + import textwrap except ImportError: raise FileProcessingError("reportlab not installed") @@ -208,28 +252,67 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:""" c = canvas.Canvas(str(output_path), pagesize=letter) width, height = letter + margin = 72 + y_position = height - margin - # Add title - c.setFont("Helvetica-Bold", 16) - title = base_name.replace('_', ' ').title() - c.drawString(100, height - 100, title) + def new_page(): + nonlocal y_position + c.showPage() + c.setFont('Helvetica', 11) + y_position = height - margin - # Add summary - c.setFont("Helvetica", 12) - y_position = height - 140 + c.setFont('Helvetica', 11) - # Simple text wrapping - lines = summary.split('\n') - for line in lines: - if y_position < 100: - c.showPage() - y_position = height - 100 - c.setFont("Helvetica", 12) + # Title + c.setFont('Helvetica-Bold', 16) + c.drawString(margin, y_position, base_name.replace('_', ' ').title()[:100]) + y_position -= 28 + c.setFont('Helvetica', 11) - c.drawString(100, y_position, line) - y_position -= 20 + summary_clean = summary.replace('**', '') # Remove asterisks globally for cleaner PDF + + for raw_line in summary_clean.splitlines(): + line = raw_line.rstrip() + + if not line.strip(): + y_position -= 14 + if y_position < margin: + new_page() + continue + + stripped = line.lstrip() + + if stripped.startswith('#'): + level = len(stripped) - len(stripped.lstrip('#')) + heading_text = stripped.lstrip('#').strip() + if heading_text: + font_size = 16 if level == 1 else 14 if level == 2 else 12 + c.setFont('Helvetica-Bold', font_size) + c.drawString(margin, y_position, heading_text[:90]) + y_position -= font_size + 6 + if y_position < margin: + new_page() + c.setFont('Helvetica', 11) + continue + + if stripped.startswith(('-', '*', '•')): + bullet_text = stripped.lstrip('-*•').strip() + wrapped_lines = textwrap.wrap(bullet_text, width=80) or [''] + for idx, wrapped in enumerate(wrapped_lines): + prefix = '• ' if idx == 0 else ' ' + c.drawString(margin, y_position, f"{prefix}{wrapped}") + y_position -= 14 + if y_position < margin: + new_page() + continue + + # Body text - Justified approximation (ReportLab native justification requires Paragraph styles, defaulting to wrap) + wrapped_lines = textwrap.wrap(stripped, width=90) or [''] + for wrapped in wrapped_lines: + c.drawString(margin, y_position, wrapped) + y_position -= 14 + if y_position < margin: + new_page() - c.showPage() c.save() - return output_path