236 lines
8.3 KiB
Python
236 lines
8.3 KiB
Python
"""
|
|
Document generation utilities
|
|
"""
|
|
import logging
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, Any, List, Tuple
|
|
from core import FileProcessingError
|
|
from config import settings
|
|
from services.ai import ai_provider_factory
|
|
|
|
|
|
class DocumentGenerator:
    """Generate summary documents (Markdown, DOCX, PDF) from processed text.

    The pipeline asks the configured AI provider for bullet points and a
    long-form summary, optionally re-formats the result with Gemini, and
    writes the final summary to disk in three formats.
    """

    # Maximum number of source-text characters embedded in each prompt.
    BULLET_TEXT_LIMIT = 15000
    SUMMARY_TEXT_LIMIT = 20000

    # Used in the summary prompt when bullet-point extraction fails.
    FALLBACK_BULLETS = "- Puntos clave no disponibles por error en IA"

    # Used when no topic can be extracted or filename generation fails.
    FALLBACK_FILENAME = "documento"

    # A capitalised word: one upper-case letter followed by lower-case letters
    # (Spanish accented vowels and "ñ" included).
    TOPIC_PATTERN = re.compile(r'\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b')

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.ai_provider = ai_provider_factory.get_best_provider()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def generate_summary(self, text: str, base_name: str) -> Tuple[bool, str, Dict[str, Any]]:
        """Generate a unified summary and write it as Markdown, DOCX and PDF.

        Args:
            text: Source text to summarise.
            base_name: Base name used for the output files and document title.

        Returns:
            ``(True, summary, metadata)`` on success, where ``metadata`` holds
            the keys ``markdown_path``, ``docx_path``, ``pdf_path``,
            ``docx_name``, ``summary`` and ``filename``.
            ``(False, "", {})`` when the text is blank or any step fails;
            this method never raises.
        """
        self.logger.info("Generating summary for %s", base_name)

        # Blank input would only make the model invent content; fail early.
        if not text or not text.strip():
            self.logger.warning("Empty text received for %s, nothing to summarize", base_name)
            return False, "", {}

        try:
            # Step 1: bullet points (best effort, falls back to a placeholder).
            bullet_points = self._generate_bullet_points(text)

            # Step 2: unified summary (mandatory, failure aborts the run).
            raw_summary = self._generate_raw_summary(text, bullet_points)

            # Step 3: Markdown clean-up with Gemini (best effort).
            summary = self._format_summary(raw_summary)

            filename = self._generate_filename(text, summary)

            markdown_path = self._create_markdown(summary, base_name)
            docx_path = self._create_docx(summary, base_name)
            pdf_path = self._create_pdf(summary, base_name)

            metadata = {
                'markdown_path': str(markdown_path),
                'docx_path': str(docx_path),
                'pdf_path': str(pdf_path),
                'docx_name': Path(docx_path).name,
                'summary': summary,
                'filename': filename,
            }
            return True, summary, metadata

        except Exception as e:
            # Top-level boundary: callers rely on the (False, "", {}) contract.
            self.logger.exception("Document generation process failed: %s", e)
            return False, "", {}

    # ------------------------------------------------------------------
    # Prompt construction (pure helpers)
    # ------------------------------------------------------------------

    @classmethod
    def _build_bullet_prompt(cls, text: str) -> str:
        """Return the bullet-point extraction prompt for ``text`` (truncated)."""
        return (
            "Analiza el siguiente texto y extrae entre 5 y 8 bullet points clave en español.\n"
            "\n"
            "REGLAS ESTRICTAS:\n"
            '1. Devuelve ÚNICAMENTE bullet points, cada línea iniciando con "- "\n'
            "2. Cada bullet debe ser conciso (12-20 palabras) y resaltar datos, fechas, "
            "conceptos o conclusiones importantes\n"
            "3. NO agregues introducciones, conclusiones ni texto explicativo\n"
            "4. Concéntrate en los puntos más importantes del texto\n"
            "5. Incluye fechas, datos específicos y nombres relevantes si los hay\n"
            "\n"
            "Texto:\n"
            f"{text[:cls.BULLET_TEXT_LIMIT]}"
        )

    @classmethod
    def _build_summary_prompt(cls, text: str, bullet_points: str) -> str:
        """Return the long-form summary prompt for ``text`` and its bullets."""
        return (
            "Eres un profesor universitario experto en historia del siglo XX. "
            "Redacta un resumen académico integrado en español usando el texto "
            "y los bullet points extraídos.\n"
            "\n"
            "REQUISITOS ESTRICTOS:\n"
            "- Extensión entre 500-700 palabras\n"
            "- Usa encabezados Markdown con jerarquía clara (##, ###)\n"
            "- Desarrolla los puntos clave con profundidad y contexto histórico\n"
            "- Mantén un tono académico y analítico\n"
            "- Incluye conclusiones significativas\n"
            "- NO agregues texto fuera del resumen\n"
            "- Devuelve únicamente el resumen en formato Markdown\n"
            "\n"
            "Contenido a resumir:\n"
            f"{text[:cls.SUMMARY_TEXT_LIMIT]}\n"
            "\n"
            "Puntos clave a incluir obligatoriamente:\n"
            f"{bullet_points}"
        )

    @staticmethod
    def _build_format_prompt(raw_summary: str) -> str:
        """Return the Markdown clean-up prompt for ``raw_summary``."""
        return (
            "Revisa y mejora el siguiente resumen en Markdown para que sea "
            "perfectamente legible:\n"
            "\n"
            f"{raw_summary}\n"
            "\n"
            "Instrucciones:\n"
            "- Corrige cualquier error de formato\n"
            "- Asegúrate de que los encabezados estén bien espaciados\n"
            '- Verifica que las viñetas usen "- " correctamente\n'
            "- Mantén exactamente el contenido existente\n"
            "- Devuelve únicamente el resumen formateado sin texto adicional"
        )

    # ------------------------------------------------------------------
    # AI steps
    # ------------------------------------------------------------------

    def _generate_bullet_points(self, text: str) -> str:
        """Ask the main provider for bullet points; return a placeholder on failure."""
        self.logger.info("Generating bullet points...")
        try:
            bullet_points = self.ai_provider.generate_text(self._build_bullet_prompt(text))
            # The logged number is the character count of the response.
            self.logger.info("Bullet points generated: %s", len(bullet_points))
            return bullet_points
        except Exception as e:
            # Bullet points only enrich the summary prompt, so keep going.
            self.logger.warning("Bullet point generation failed: %s", e)
            return self.FALLBACK_BULLETS

    def _generate_raw_summary(self, text: str, bullet_points: str) -> str:
        """Ask the main provider for the unified summary; re-raise on failure."""
        self.logger.info("Generating unified summary...")
        try:
            return self.ai_provider.generate_text(
                self._build_summary_prompt(text, bullet_points)
            )
        except Exception as e:
            self.logger.error("Raw summary generation failed: %s", e)
            raise  # bare raise keeps the original traceback

    def _format_summary(self, raw_summary: str) -> str:
        """Re-format ``raw_summary`` with Gemini, falling back to the raw text.

        The provider import and construction live inside the ``try`` so that a
        missing or broken formatter degrades to the raw summary instead of
        aborting the whole generation.
        """
        self.logger.info("Formatting summary with Gemini...")
        try:
            from services.ai.gemini_provider import GeminiProvider

            formatter = GeminiProvider()
            if formatter.is_available():
                formatted = formatter.generate_text(self._build_format_prompt(raw_summary))
                if formatted and formatted.strip():
                    return formatted
                self.logger.warning("Gemini formatter returned empty output, using raw summary")
            else:
                self.logger.warning("Gemini formatter not available, using raw summary")
        except Exception as e:
            self.logger.warning("Formatting failed (%s), using raw summary", e)
        return raw_summary

    # ------------------------------------------------------------------
    # Filename generation
    # ------------------------------------------------------------------

    @classmethod
    def _extract_topics(cls, source_text: str, max_topics: int = 3) -> List[str]:
        """Return up to ``max_topics`` capitalised words found in ``source_text``."""
        return cls.TOPIC_PATTERN.findall(source_text or "")[:max_topics]

    @staticmethod
    def _join_topics(topics: List[str], topic_limit: int, total_limit: int) -> str:
        """Join ``topics`` with underscores, truncating each topic and the result."""
        return '_'.join(topic[:topic_limit] for topic in topics)[:total_limit]

    def _generate_filename(self, text: str, summary: str) -> str:
        """Build a short filename from capitalised words in ``summary``.

        Args:
            text: Original source text (currently unused; kept for interface
                compatibility).
            summary: Final summary from which topic words are taken.

        Returns:
            Up to three topic words joined by underscores and limited by
            ``settings.MAX_FILENAME_TOPICS_LENGTH`` / ``MAX_FILENAME_LENGTH``,
            or ``"documento"`` when nothing can be extracted or an error occurs.
        """
        try:
            # Extract from the summary itself. Scanning a prompt template here
            # would always yield the template's own capitalised words.
            sanitize = getattr(self.ai_provider, 'sanitize_input', None)
            if callable(sanitize):
                # NOTE(review): sanitize_input is assumed to return a string.
                topics_source = sanitize(summary or "")
            else:
                topics_source = (summary or "")[:100]

            topics = self._extract_topics(topics_source) or [self.FALLBACK_FILENAME]
            return self._join_topics(
                topics,
                settings.MAX_FILENAME_TOPICS_LENGTH,
                settings.MAX_FILENAME_LENGTH,
            )
        except Exception as e:
            self.logger.error("Filename generation failed: %s", e)
            # No base name is available in this scope; use a fixed fallback.
            return self.FALLBACK_FILENAME

    # ------------------------------------------------------------------
    # Document writers
    # ------------------------------------------------------------------

    def _create_markdown(self, summary: str, base_name: str) -> Path:
        """Write the summary as ``<base_name>_unificado.md`` and return its path."""
        output_dir = Path(settings.LOCAL_DOWNLOADS_PATH)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / f"{base_name}_unificado.md"

        title = base_name.replace('_', ' ').title()
        content = (
            f"# {title}\n"
            "\n"
            "## Resumen\n"
            "\n"
            f"{summary}\n"
            "\n"
            "---\n"
            "\n"
            "*Generado por CBCFacil*\n"
        )
        output_path.write_text(content, encoding='utf-8')
        return output_path

    def _create_docx(self, summary: str, base_name: str) -> Path:
        """Write the summary as ``<base_name>_unificado.docx`` and return its path.

        Raises:
            FileProcessingError: If python-docx is not installed.
        """
        try:
            from docx import Document
        except ImportError as exc:
            raise FileProcessingError("python-docx not installed") from exc

        output_dir = Path(settings.LOCAL_DOCX)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / f"{base_name}_unificado.docx"

        doc = Document()
        doc.add_heading(base_name.replace('_', ' ').title(), 0)
        doc.add_heading('Resumen', level=1)
        doc.add_paragraph(summary)

        doc.add_page_break()
        doc.add_paragraph("*Generado por CBCFacil*")

        # Pass a plain string path, which every python-docx version accepts.
        doc.save(str(output_path))
        return output_path

    def _create_pdf(self, summary: str, base_name: str) -> Path:
        """Write the summary as ``<base_name>_unificado.pdf`` and return its path.

        Each summary line is wrapped to the printable width so long paragraphs
        do not run off the right edge of the page.

        Raises:
            FileProcessingError: If reportlab is not installed.
        """
        try:
            from reportlab.lib.pagesizes import letter
            from reportlab.lib.utils import simpleSplit
            from reportlab.pdfgen import canvas
        except ImportError as exc:
            raise FileProcessingError("reportlab not installed") from exc

        output_dir = Path(settings.LOCAL_DOWNLOADS_PATH)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / f"{base_name}_unificado.pdf"

        margin = 100          # left/right/bottom margin in points
        line_height = 20      # vertical distance between text lines
        body_font, body_size = "Helvetica", 12

        pdf = canvas.Canvas(str(output_path), pagesize=letter)
        width, height = letter
        printable_width = width - 2 * margin

        # Title
        pdf.setFont("Helvetica-Bold", 16)
        pdf.drawString(margin, height - 100, base_name.replace('_', ' ').title())

        # Body
        pdf.setFont(body_font, body_size)
        y_position = height - 140

        for source_line in summary.split('\n'):
            # simpleSplit returns [] for a blank line; keep it as vertical space.
            wrapped = simpleSplit(source_line, body_font, body_size, printable_width) or [""]
            for line in wrapped:
                if y_position < margin:
                    pdf.showPage()
                    y_position = height - 100
                    # showPage resets the font state, so set it again.
                    pdf.setFont(body_font, body_size)
                pdf.drawString(margin, y_position, line)
                y_position -= line_height

        pdf.showPage()
        pdf.save()
        return output_path
|