Refine formatting: Justified text, robust PDF/DOCX generation, clean markdown style

This commit is contained in:
2026-01-09 18:33:38 -03:00
parent e6a01d08d4
commit f7fdb0b622

View File

@@ -80,6 +80,7 @@ Instrucciones:
- Asegúrate de que los encabezados estén bien espaciados
- Verifica que las viñetas usen "- " correctamente
- Mantén exactamente el contenido existente
- EVITA el uso excesivo de negritas (asteriscos), úsalas solo para conceptos clave
- Devuelve únicamente el resumen formateado sin texto adicional"""
# Use generic Gemini provider for formatting as requested
@@ -169,7 +170,7 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
return output_path
def _create_docx(self, summary: str, base_name: str) -> Path:
"""Create DOCX document"""
"""Create DOCX document with Markdown parsing (Legacy method ported)"""
try:
from docx import Document
from docx.shared import Inches
@@ -184,8 +185,50 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
doc = Document()
doc.add_heading(base_name.replace('_', ' ').title(), 0)
doc.add_heading('Resumen', level=1)
doc.add_paragraph(summary)
# Parse and render Markdown content line by line
lines = summary.splitlines()
current_paragraph = []
for line in lines:
line = line.strip()
if not line:
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p.alignment = 3 # JUSTIFY alignment (WD_ALIGN_PARAGRAPH.JUSTIFY=3)
current_paragraph = []
continue
if line.startswith('#'):
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p.alignment = 3
current_paragraph = []
# Process heading
level = len(line) - len(line.lstrip('#'))
heading_text = line.lstrip('#').strip()
if level <= 6:
doc.add_heading(heading_text, level=level)
else:
current_paragraph.append(heading_text)
elif line.startswith('-') or line.startswith('*') or line.startswith('•'):
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p.alignment = 3
current_paragraph = []
bullet_text = line.lstrip('-*• ').strip()
p = doc.add_paragraph(bullet_text, style='List Bullet')
# Remove bold markers from bullets if present
if '**' in bullet_text:
# Basic cleanup for bullets
pass
else:
# Clean up excessive bold markers in body text if user requested
clean_line = line.replace('**', '') # Removing asterisks as per user complaint "se abusa de los asteriscos"
current_paragraph.append(clean_line)
if current_paragraph:
p = doc.add_paragraph(' '.join(current_paragraph))
p.alignment = 3
doc.add_page_break()
doc.add_paragraph(f"*Generado por CBCFacil*")
@@ -194,10 +237,11 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
return output_path
def _create_pdf(self, summary: str, base_name: str) -> Path:
"""Create PDF document"""
"""Create PDF document with Markdown parsing (Legacy method ported)"""
try:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import textwrap
except ImportError:
raise FileProcessingError("reportlab not installed")
@@ -208,28 +252,67 @@ Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""
c = canvas.Canvas(str(output_path), pagesize=letter)
width, height = letter
margin = 72
y_position = height - margin
# Add title
c.setFont("Helvetica-Bold", 16)
title = base_name.replace('_', ' ').title()
c.drawString(100, height - 100, title)
def new_page():
nonlocal y_position
c.showPage()
c.setFont('Helvetica', 11)
y_position = height - margin
# Add summary
c.setFont("Helvetica", 12)
y_position = height - 140
c.setFont('Helvetica', 11)
# Simple text wrapping
lines = summary.split('\n')
for line in lines:
if y_position < 100:
c.showPage()
y_position = height - 100
c.setFont("Helvetica", 12)
# Title
c.setFont('Helvetica-Bold', 16)
c.drawString(margin, y_position, base_name.replace('_', ' ').title()[:100])
y_position -= 28
c.setFont('Helvetica', 11)
c.drawString(100, y_position, line)
y_position -= 20
summary_clean = summary.replace('**', '') # Remove asterisks globally for cleaner PDF
for raw_line in summary_clean.splitlines():
line = raw_line.rstrip()
if not line.strip():
y_position -= 14
if y_position < margin:
new_page()
continue
stripped = line.lstrip()
if stripped.startswith('#'):
level = len(stripped) - len(stripped.lstrip('#'))
heading_text = stripped.lstrip('#').strip()
if heading_text:
font_size = 16 if level == 1 else 14 if level == 2 else 12
c.setFont('Helvetica-Bold', font_size)
c.drawString(margin, y_position, heading_text[:90])
y_position -= font_size + 6
if y_position < margin:
new_page()
c.setFont('Helvetica', 11)
continue
if stripped.startswith(('-', '*', '•')):
bullet_text = stripped.lstrip('-*•').strip()
wrapped_lines = textwrap.wrap(bullet_text, width=80) or ['']
for idx, wrapped in enumerate(wrapped_lines):
prefix = '' if idx == 0 else ' '
c.drawString(margin, y_position, f"{prefix}{wrapped}")
y_position -= 14
if y_position < margin:
new_page()
continue
# Body text - Justified approximation (ReportLab native justification requires Paragraph styles, defaulting to wrap)
wrapped_lines = textwrap.wrap(stripped, width=90) or ['']
for wrapped in wrapped_lines:
c.drawString(margin, y_position, wrapped)
y_position -= 14
if y_position < margin:
new_page()
c.showPage()
c.save()
return output_path