Initial commit - cleaned for CV
This commit is contained in:
7
document/__init__.py
Normal file
7
document/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""
|
||||
Document generation package for CBCFacil
|
||||
"""
|
||||
|
||||
from .generators import DocumentGenerator
|
||||
|
||||
__all__ = ['DocumentGenerator']
|
||||
669
document/generators.py
Normal file
669
document/generators.py
Normal file
@@ -0,0 +1,669 @@
|
||||
"""
|
||||
Document generation utilities - LaTeX Academic Summary System
|
||||
|
||||
This module generates comprehensive academic summaries in LaTeX format
|
||||
following the specifications in latex/resumen.md (the SINGLE SOURCE OF TRUTH).
|
||||
|
||||
Parallel Processing: Uses multiple agents for accelerated summary generation:
|
||||
- AI Provider Racing: Multiple AI providers generate in parallel
|
||||
- Parallel Format Conversion: PDF + DOCX generated simultaneously
|
||||
- Background Notion Uploads: Non-blocking uploads to Notion
|
||||
"""
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
import shutil
|
||||
import re
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional, Tuple, Callable
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from core import FileProcessingError
|
||||
from config import settings
|
||||
from services.ai import ai_provider_factory
|
||||
from services.ai.prompt_manager import prompt_manager
|
||||
|
||||
|
||||
def _sanitize_latex(latex_code: str) -> str:
|
||||
"""
|
||||
Pre-process LaTeX code to fix common errors before compilation.
|
||||
|
||||
This function applies automated fixes for known issues that AI models
|
||||
frequently generate, reducing the need for fix_latex() iterations.
|
||||
|
||||
Currently handles:
|
||||
- TikZ nodes with line breaks (\\\\) missing align=center
|
||||
- Unbalanced environments (best effort)
|
||||
"""
|
||||
if not latex_code:
|
||||
return latex_code
|
||||
|
||||
result = latex_code
|
||||
|
||||
# Fix TikZ nodes with \\\\ but missing align=center
|
||||
# Pattern: \node[...] (name) {Text\\More};
|
||||
# This is a common AI error - TikZ requires align=center for \\\\ in nodes
|
||||
|
||||
# We need to find \node commands and add align=center if they have \\\\ in content
|
||||
# but don't already have align= in their options
|
||||
|
||||
def fix_tikz_node(match):
|
||||
"""Fix a single TikZ node by adding align=center if needed"""
|
||||
full_match = match.group(0)
|
||||
options = match.group(1) # Content inside [...]
|
||||
rest = match.group(2) # Everything after options
|
||||
|
||||
# Check if this node has \\\\ in its content (text between { })
|
||||
# and doesn't already have align=
|
||||
if "\\\\" in rest and "align=" not in options:
|
||||
# Add align=center to the options
|
||||
if options.strip():
|
||||
new_options = options.rstrip() + ", align=center"
|
||||
else:
|
||||
new_options = "align=center"
|
||||
return f"\\node[{new_options}]{rest}"
|
||||
|
||||
return full_match
|
||||
|
||||
# Match \node[options] followed by rest of the line
|
||||
# Capture options and the rest separately
|
||||
tikz_node_pattern = r"\\node\[([^\]]*)\]([^;]*;)"
|
||||
result = re.sub(tikz_node_pattern, fix_tikz_node, result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class DocumentGenerator:
    """
    Generates academic summary documents in LaTeX format.

    The system follows these principles:
    1. latex/resumen.md is the SINGLE SOURCE OF TRUTH for prompt structure
    2. Generates full LaTeX documents (not Markdown)
    3. Compiles to PDF using pdflatex
    4. Supports iterative fixing with AI if compilation fails
    5. Supports progress notifications via callback
    """

    def __init__(self, notification_callback: Optional[Callable[[str], None]] = None):
        """
        Initialize DocumentGenerator.

        Args:
            notification_callback: Optional callback function for progress notifications
                Takes a single string argument (message to send)
        """
        self.logger = logging.getLogger(__name__)
        # Single "best" provider used when parallel racing is unavailable.
        self.ai_provider = ai_provider_factory.get_best_provider()
        self.notification_callback = notification_callback
        # True when multiple AI providers are configured and can be raced.
        self.use_parallel = ai_provider_factory.use_parallel()
        # Shared pool used for parallel format conversion (see
        # _convert_formats_parallel).
        self.executor = ThreadPoolExecutor(max_workers=4)

        # Ensure output directories exist
        settings.LOCAL_DOWNLOADS_PATH.mkdir(parents=True, exist_ok=True)
        settings.LOCAL_DOCX.mkdir(parents=True, exist_ok=True)

        if self.use_parallel:
            self.logger.info(
                "🚀 Parallel processing enabled: Multiple AI providers available"
            )
|
||||
|
||||
def _notify(self, message: str) -> None:
|
||||
"""Send notification if callback is configured"""
|
||||
if self.notification_callback:
|
||||
try:
|
||||
self.notification_callback(message)
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to send notification: {e}")
|
||||
|
||||
def _generate_with_parallel_provider(self, prompt: str, **kwargs) -> str:
    """
    Generate content using multiple AI providers in parallel.

    Races the configured providers and returns the first successful
    response. Any failure during racing degrades gracefully to a single
    call against the best provider.
    """
    try:
        racer = ai_provider_factory.get_parallel_provider(max_workers=4)
        self.logger.info("🚀 Using parallel AI provider (race mode)")

        outcome = racer.generate_parallel(
            prompt=prompt,
            strategy="race",  # Use first successful response
            timeout_ms=300000,  # 5 minutes
            **kwargs,
        )

        self.logger.info(
            f"✅ Parallel generation complete: {outcome.selected_provider} selected, "
            f"{outcome.total_duration_ms}ms"
        )
        return outcome.content

    except Exception as e:
        self.logger.warning(
            f"⚠️ Parallel generation failed: {e}, falling back to single provider"
        )
        return self.ai_provider.generate_text(prompt, **kwargs)
|
||||
|
||||
def _convert_formats_parallel(
    self, tex_path: Path, pdf_path: Optional[Path], base_name: str
) -> Optional[Path]:
    """
    Run format conversion on the shared executor.

    The DOCX conversion is submitted only when pandoc is available; a
    previously compiled PDF (if any) is left untouched.

    Returns:
        Path to the generated DOCX, or None when pandoc is missing or
        the conversion failed / timed out.
    """
    docx_future = None
    if shutil.which("pandoc"):
        docx_future = self.executor.submit(
            self._convert_tex_to_docx, tex_path, base_name
        )

    if docx_future is None:
        return None

    docx_path = None
    try:
        docx_path = docx_future.result(timeout=60)
        if docx_path:
            self.logger.info(f"✅ Parallel DOCX generated: {docx_path}")
    except Exception as e:
        self.logger.warning(f"⚠️ DOCX generation failed: {e}")

    return docx_path
|
||||
|
||||
def _upload_to_notion_background(
    self,
    base_name: str,
    summary: str,
    pdf_path: Optional[Path],
    metadata: Dict[str, Any],
):
    """Upload the summary to Notion on a daemon thread (non-blocking).

    Mutates *metadata* in place (``notion_uploaded`` / ``notion_page_id``)
    once the upload succeeds; every error is logged and swallowed so the
    main pipeline never blocks on Notion.
    """

    def _worker():
        try:
            from services.notion_service import notion_service

            title = base_name.replace("_", " ").title()
            page_id = notion_service.create_page_with_summary(
                title=title,
                summary=summary,
                metadata={
                    "file_type": "Audio",
                    "pdf_path": pdf_path or Path(""),
                    "add_status": False,
                    "use_as_page": False,
                },
            )

            if page_id:
                metadata["notion_uploaded"] = True
                metadata["notion_page_id"] = page_id
                self.logger.info(
                    f"✅ Background upload to Notion complete: {title}"
                )
            else:
                self.logger.warning(f"⚠️ Background Notion upload failed: {title}")

        except Exception as e:
            self.logger.warning(f"❌ Background Notion upload error: {e}")

    # Daemon thread: must not keep the process alive on shutdown.
    threading.Thread(target=_worker, daemon=True).start()
    self.logger.info("🔄 Notion upload started in background")
|
||||
|
||||
def generate_summary(
    self,
    text: str,
    base_name: str,
    materia: str = "Economía",
    bibliographic_text: Optional[str] = None,
    class_number: Optional[int] = None,
) -> Tuple[bool, str, Dict[str, Any]]:
    """
    Generate comprehensive academic summary in LaTeX format.

    Pipeline: (1) the AI generates LaTeX, (2) clean LaTeX is extracted
    from the raw response, (3) a compile/self-correct loop runs with up
    to ``max_retries`` AI fix rounds, (4) if everything fails, the output
    is saved via the Markdown fallback instead.

    Args:
        text: The class transcription text
        base_name: Base filename for output files
        materia: Subject name (default: "Economía")
        bibliographic_text: Optional supporting material from books/notes
        class_number: Optional class number for header

    Returns:
        Tuple of (success, summary_text, metadata)
    """
    self.logger.info(
        f"🚀 Starting LaTeX academic summary generation for: {base_name}"
    )

    # Result record returned to the caller; fields are filled in as each
    # pipeline stage succeeds.
    metadata = {
        "filename": base_name,
        "tex_path": "",
        "pdf_path": "",
        "markdown_path": "",
        "docx_path": "",
        "summary_snippet": "",
        "notion_uploaded": False,
        "notion_page_id": None,
        "materia": materia,
    }

    try:
        # === STEP 1: Generate LaTeX content using AI ===
        self.logger.info(
            "🧠 Sending request to AI Provider for LaTeX generation..."
        )
        self._notify("📝 Preparando prompt de resumen académico...")

        prompt = prompt_manager.get_latex_summary_prompt(
            transcription=text,
            materia=materia,
            bibliographic_text=bibliographic_text,
            class_number=class_number,
        )

        self._notify(
            "🧠 Enviando solicitud a la IA (esto puede tardar unos minutos)..."
        )

        # Use parallel provider if multiple AI providers are available
        if self.use_parallel:
            raw_response = self._generate_with_parallel_provider(prompt)
        else:
            raw_response = self.ai_provider.generate_text(prompt)

        if not raw_response:
            raise FileProcessingError("AI returned empty response")

        self.logger.info(f"📝 AI response received: {len(raw_response)} characters")
        self._notify(f"✅ Respuesta recibida ({len(raw_response)} caracteres)")

        # === STEP 2: Extract clean LaTeX from AI response ===
        self._notify("🔍 Extrayendo código LaTeX...")

        latex_content = prompt_manager.extract_latex_from_response(raw_response)

        if not latex_content:
            self.logger.warning(
                "⚠️ No valid LaTeX found in response, treating as Markdown"
            )
            self._notify("⚠️ No se detectó LaTeX válido, usando modo compatible...")
            # Fallback to Markdown processing
            return self._fallback_to_markdown(raw_response, base_name, metadata)

        self.logger.info("✨ Valid LaTeX content detected")
        self._notify(f"✨ LaTeX detectado: {len(latex_content)} caracteres")

        # === STEP 3: Compilation Loop with Self-Correction ===
        # max_retries AI fix rounds -> max_retries + 1 compile attempts.
        max_retries = 3
        current_latex = latex_content

        for attempt in range(max_retries + 1):
            # Sanitize LaTeX before saving (fix common AI errors like TikZ nodes)
            current_latex = _sanitize_latex(current_latex)

            # Save current .tex file
            self._notify(
                f"📄 Guardando archivo .tex (intento {attempt + 1}/{max_retries + 1})..."
            )

            tex_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.tex"
            tex_path.write_text(current_latex, encoding="utf-8")
            metadata["tex_path"] = str(tex_path)

            # Try to compile
            self._notify("⚙️ Primera pasada de compilación LaTeX...")

            pdf_path = self._compile_latex(
                tex_path, output_dir=settings.LOCAL_DOWNLOADS_PATH
            )

            if pdf_path:
                # Success path: produce DOCX, plain-text summary, and
                # (optionally) a background Notion upload, then return.
                self.logger.info(
                    f"✅ Compilation success on attempt {attempt + 1}!"
                )
                self._notify("✅ PDF generado exitosamente!")
                metadata["pdf_path"] = str(pdf_path)

                # Generate DOCX in parallel
                self._notify("📄 Generando archivo DOCX en paralelo...")
                docx_path = self._convert_formats_parallel(
                    tex_path, pdf_path, base_name
                )
                if docx_path:
                    self._notify("✅ DOCX generado exitosamente!")
                    metadata["docx_path"] = str(docx_path)

                # Create a text summary for Notion/preview
                text_summary = self._create_text_summary(current_latex)
                metadata["summary_snippet"] = text_summary[:500] + "..."

                # Upload to Notion in background if configured
                if settings.has_notion_config:
                    self._notify("📤 Iniciando carga a Notion en segundo plano...")
                    self._upload_to_notion_background(
                        base_name=base_name,
                        summary=text_summary,
                        pdf_path=pdf_path,
                        metadata=metadata,
                    )

                self._notify("🎉 ¡Resumen completado con éxito!")
                return True, text_summary, metadata

            # Compilation failed - ask AI to fix
            if attempt < max_retries:
                self.logger.warning(
                    f"⚠️ Compilation failed (Attempt {attempt + 1}/{max_retries + 1}). "
                    f"Requesting AI fix..."
                )
                self._notify(
                    f"⚠️ Error de compilación ({attempt + 1}/{max_retries + 1}), solicitando corrección a IA..."
                )

                # Get error log (last 2000 chars keeps the prompt small
                # while still containing the failing error message).
                log_file = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.log"
                error_log = "Log file not found"
                if log_file.exists():
                    error_log = log_file.read_text(
                        encoding="utf-8", errors="ignore"
                    )[-2000:]

                # Ask AI to fix
                try:
                    self._notify("🔧 La IA está corrigiendo el código LaTeX...")
                    if hasattr(self.ai_provider, "fix_latex"):
                        fixed_latex = self.ai_provider.fix_latex(
                            current_latex, error_log
                        )
                        # Prefer the extracted/cleaned LaTeX; fall back to
                        # the raw fix if extraction found nothing.
                        cleaned = prompt_manager.extract_latex_from_response(
                            fixed_latex
                        )
                        if cleaned:
                            current_latex = cleaned
                        else:
                            current_latex = fixed_latex
                        self._notify(
                            "✅ Código LaTeX corregido, reintentando compilación..."
                        )
                    else:
                        # Provider cannot self-correct; give up on LaTeX
                        # and drop to the Markdown fallback below.
                        self.logger.error(
                            "❌ AI provider doesn't support fix_latex()"
                        )
                        break
                except Exception as e:
                    self.logger.error(f"❌ AI fix request failed: {e}")
                    break
            else:
                self.logger.error(
                    "❌ Max retries reached. LaTeX compilation failed."
                )
                self._notify(
                    "❌ No se pudo compilar el LaTeX después de varios intentos"
                )

        # If we get here, all compilation attempts failed
        # (`current_latex or raw_response` guards against an empty sanitize
        # result — presumably defensive; TODO confirm it can occur).
        self._notify("⚠️ Usando modo de compatibilidad Markdown...")
        return self._fallback_to_markdown(
            current_latex or raw_response, base_name, metadata
        )

    except Exception as e:
        self.logger.error(
            f"❌ Critical error in document generation: {e}", exc_info=True
        )
        self._notify(f"❌ Error en la generación: {str(e)[:100]}")
        return False, "", metadata
|
||||
|
||||
def _compile_latex(self, tex_path: Path, output_dir: Path) -> Optional[Path]:
|
||||
"""
|
||||
Compile LaTeX to PDF using pdflatex. Runs twice for TOC.
|
||||
|
||||
Args:
|
||||
tex_path: Path to .tex file
|
||||
output_dir: Directory for output files
|
||||
|
||||
Returns:
|
||||
Path to generated PDF or None if failed
|
||||
"""
|
||||
base_name = tex_path.stem
|
||||
expected_pdf = output_dir / f"{base_name}.pdf"
|
||||
|
||||
# Check if pdflatex is available
|
||||
if not shutil.which("pdflatex"):
|
||||
self.logger.error("🚫 pdflatex not found in system PATH")
|
||||
return None
|
||||
|
||||
cmd = [
|
||||
"pdflatex",
|
||||
"-interaction=nonstopmode",
|
||||
"-halt-on-error",
|
||||
f"-output-directory={output_dir}",
|
||||
str(tex_path),
|
||||
]
|
||||
|
||||
try:
|
||||
# Pass 1
|
||||
self.logger.info("⚙️ Compiling LaTeX (Pass 1/2)...")
|
||||
subprocess.run(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
check=False,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
# Pass 2 (for TOC resolution)
|
||||
self.logger.info("⚙️ Compiling LaTeX (Pass 2/2)...")
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
check=False,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
if result.returncode == 0 and expected_pdf.exists():
|
||||
self.logger.info(f"✅ PDF generated: {expected_pdf}")
|
||||
self._cleanup_latex_aux(output_dir, base_name)
|
||||
return expected_pdf
|
||||
else:
|
||||
# Read log file for error info
|
||||
log_file = output_dir / f"{base_name}.log"
|
||||
error_snippet = "Unknown error"
|
||||
if log_file.exists():
|
||||
try:
|
||||
log_content = log_file.read_text(
|
||||
encoding="utf-8", errors="ignore"
|
||||
)
|
||||
errors = [
|
||||
line
|
||||
for line in log_content.splitlines()
|
||||
if line.startswith("!")
|
||||
]
|
||||
if errors:
|
||||
error_snippet = errors[0][:200]
|
||||
except:
|
||||
pass
|
||||
|
||||
self.logger.error(f"❌ LaTeX compilation failed: {error_snippet}")
|
||||
return None
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
self.logger.error("❌ LaTeX compilation timed out")
|
||||
return None
|
||||
except Exception as e:
|
||||
self.logger.error(f"❌ Error during LaTeX execution: {e}")
|
||||
return None
|
||||
|
||||
def _convert_tex_to_docx(self, tex_path: Path, base_name: str) -> Optional[Path]:
    """Convert a .tex file to .docx with Pandoc.

    Returns the generated DOCX path, or None when pandoc is missing or
    the conversion fails/times out.
    """
    if shutil.which("pandoc") is None:
        self.logger.warning("⚠️ pandoc not found, skipping DOCX generation")
        return None

    docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"

    try:
        subprocess.run(
            ["pandoc", str(tex_path), "-o", str(docx_path)],
            capture_output=True,
            text=True,
            check=True,
            timeout=60,
        )
    except Exception as e:
        self.logger.warning(f"⚠️ DOCX generation failed: {e}")
        return None

    self.logger.info(f"✅ DOCX generated: {docx_path}")
    return docx_path
|
||||
|
||||
def _create_text_summary(self, latex_content: str) -> str:
|
||||
"""Extract a plain text summary from LaTeX content for Notion/preview."""
|
||||
# Remove LaTeX commands and keep content
|
||||
text = latex_content
|
||||
|
||||
# Remove document class and packages
|
||||
text = re.sub(r"\\documentclass\[?[^\]]*\]?\{[^\}]+\}", "", text)
|
||||
text = re.sub(r"\\usepackage\{[^\}]+\}", "", text)
|
||||
text = re.sub(r"\\geometry\{[^\}]+\}", "", text)
|
||||
text = re.sub(r"\\pagestyle\{[^\}]+\}", "", text)
|
||||
text = re.sub(r"\\fancyhf\{\}", "", text)
|
||||
text = re.sub(r"\\fancyhead\[?[^\]]*\]?\{[^\}]+\}", "", text)
|
||||
text = re.sub(r"\\fancyfoot\[?[^\]]*\]?\{[^\}]+\}", "", text)
|
||||
|
||||
# Convert sections to markdown-style
|
||||
text = re.sub(r"\\section\*?\{([^\}]+)\}", r"# \1", text)
|
||||
text = re.sub(r"\\subsection\*?\{([^\}]+)\}", r"## \1", text)
|
||||
text = re.sub(r"\\subsubsection\*?\{([^\}]+)\}", r"### \1", text)
|
||||
|
||||
# Remove tcolorbox environments (keep content)
|
||||
text = re.sub(
|
||||
r"\\begin\{(definicion|importante|ejemplo)\}\[?[^\]]*\]?",
|
||||
r"\n**\1:** ",
|
||||
text,
|
||||
)
|
||||
text = re.sub(r"\\end\{(definicion|importante|ejemplo)\}", "", text)
|
||||
|
||||
# Convert itemize to bullets
|
||||
text = re.sub(r"\\item\s*", "- ", text)
|
||||
text = re.sub(r"\\begin\{(itemize|enumerate)\}", "", text)
|
||||
text = re.sub(r"\\end\{(itemize|enumerate)\}", "", text)
|
||||
|
||||
# Clean up math (basic)
|
||||
text = re.sub(r"\$\$([^\$]+)\$\$", r"\n\n\1\n\n", text)
|
||||
text = re.sub(r"\$([^\$]+)\$", r"\1", text)
|
||||
|
||||
# Remove remaining LaTeX commands
|
||||
text = re.sub(r"\\[a-zA-Z]+(\{[^\}]*\})*", "", text)
|
||||
text = re.sub(r"[{}]", "", text)
|
||||
|
||||
# Clean whitespace
|
||||
text = re.sub(r"\n\s*\n\s*\n", "\n\n", text)
|
||||
text = text.strip()
|
||||
|
||||
return text
|
||||
|
||||
def _fallback_to_markdown(
    self, content: str, base_name: str, metadata: Dict[str, Any]
) -> Tuple[bool, str, Dict[str, Any]]:
    """Persist *content* as Markdown when LaTeX generation fails.

    Writes a `<base>_resumen.md` file, then best-effort converts it to
    PDF and DOCX via pandoc. Always reports success so the caller still
    receives a usable text summary.
    """
    self.logger.warning("⚠️ Falling back to Markdown processing")

    md_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_resumen.md"
    md_path.write_text(content, encoding="utf-8")
    metadata["markdown_path"] = str(md_path)

    # Best-effort conversions; skipped entirely when pandoc is absent.
    if shutil.which("pandoc"):
        pdf_path = self._convert_md_to_pdf(md_path, base_name)
        if pdf_path:
            metadata["pdf_path"] = str(pdf_path)

        docx_path = self._convert_md_to_docx(md_path, base_name)
        if docx_path:
            metadata["docx_path"] = str(docx_path)

    metadata["summary_snippet"] = content[:500] + "..."
    return True, content, metadata
|
||||
|
||||
def _convert_md_to_pdf(self, md_path: Path, base_name: str) -> Optional[Path]:
    """Convert a Markdown file to PDF using pandoc (pdflatex engine).

    Returns the PDF path on success, None when conversion fails or
    times out.
    """
    pdf_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.pdf"
    command = [
        "pandoc",
        str(md_path),
        "-o",
        str(pdf_path),
        "--pdf-engine=pdflatex",
        "-V",
        "geometry:margin=2.5cm",
    ]

    try:
        subprocess.run(
            command, capture_output=True, text=True, check=True, timeout=60
        )
    except Exception as e:
        self.logger.warning(f"⚠️ PDF from Markdown failed: {e}")
        return None

    self.logger.info(f"✅ PDF from Markdown: {pdf_path}")
    return pdf_path
|
||||
|
||||
def _convert_md_to_docx(self, md_path: Path, base_name: str) -> Optional[Path]:
    """Convert a Markdown file to DOCX using pandoc.

    Returns the DOCX path on success, None when conversion fails or
    times out.
    """
    docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
    command = ["pandoc", str(md_path), "-o", str(docx_path)]

    try:
        subprocess.run(
            command, capture_output=True, text=True, check=True, timeout=60
        )
    except Exception as e:
        self.logger.warning(f"⚠️ DOCX from Markdown failed: {e}")
        return None

    self.logger.info(f"✅ DOCX from Markdown: {docx_path}")
    return docx_path
|
||||
|
||||
def _cleanup_latex_aux(self, output_dir: Path, base_name: str):
|
||||
"""Clean up auxiliary LaTeX files."""
|
||||
extensions = [".aux", ".log", ".out", ".toc"]
|
||||
for ext in extensions:
|
||||
aux_file = output_dir / f"{base_name}{ext}"
|
||||
if aux_file.exists():
|
||||
try:
|
||||
aux_file.unlink()
|
||||
except:
|
||||
pass
|
||||
|
||||
def _upload_to_notion(
    self,
    base_name: str,
    summary: str,
    pdf_path: Optional[Path],
    metadata: Dict[str, Any],
):
    """Synchronously upload the summary to Notion (best effort).

    On success, sets metadata["notion_uploaded"] and stores the page id.
    Every failure — including the service import — is logged as a
    warning and swallowed.
    """
    try:
        from services.notion_service import notion_service

        title = base_name.replace("_", " ").title()
        page_id = notion_service.create_page_with_summary(
            title=title,
            summary=summary,
            metadata={
                "file_type": "Audio",
                "pdf_path": pdf_path or Path(""),
                "add_status": False,
                "use_as_page": False,
            },
        )

        if not page_id:
            self.logger.warning(f"⚠️ Notion upload failed: {title}")
            return

        metadata["notion_uploaded"] = True
        metadata["notion_page_id"] = page_id
        self.logger.info(f"✅ Uploaded to Notion: {title}")

    except Exception as e:
        self.logger.warning(f"❌ Notion upload error: {e}")
|
||||
Reference in New Issue
Block a user