Cambios principales: ## Nuevos archivos - services/ai/parallel_provider.py: Ejecución paralela de múltiples proveedores AI - services/ai/prompt_manager.py: Gestión centralizada de prompts (resumen.md como fuente) - latex/resumen.md: Template del prompt para resúmenes académicos LaTeX ## Mejoras en generación LaTeX (document/generators.py) - Nueva función _sanitize_latex(): Corrige automáticamente errores comunes de AI - Agrega align=center a nodos TikZ con saltos de línea (\\) - Previene errores 'Not allowed in LR mode' antes de compilar - Soporte para procesamiento paralelo de proveedores AI - Conversión DOCX en paralelo con generación PDF - Uploads a Notion en background (non-blocking) - Callbacks de notificación para progreso en Telegram ## Mejoras en proveedores AI - claude_provider.py: fix_latex() con instrucciones específicas para errores TikZ - gemini_provider.py: fix_latex() mejorado + rate limiting + circuit breaker - provider_factory.py: Soporte para parallel provider ## Otros cambios - config/settings.py: Nuevas configuraciones para Gemini models - services/webdav_service.py: Mejoras en manejo de conexión - .gitignore: Ignora archivos LaTeX auxiliares (.aux, .toc, .out, .pdf) ## Archivos de ejemplo - latex/imperio_romano.tex, latex/clase_revolucion_rusa_crisis_30.tex - resumen_curiosidades.tex (corregido y compilado exitosamente)
670 lines · 25 KiB · Python
"""
|
|
Document generation utilities - LaTeX Academic Summary System
|
|
|
|
This module generates comprehensive academic summaries in LaTeX format
|
|
following the specifications in latex/resumen.md (the SINGLE SOURCE OF TRUTH).
|
|
|
|
Parallel Processing: Uses multiple agents for accelerated summary generation:
|
|
- AI Provider Racing: Multiple AI providers generate in parallel
|
|
- Parallel Format Conversion: PDF + DOCX generated simultaneously
|
|
- Background Notion Uploads: Non-blocking uploads to Notion
|
|
"""
|
|
|
|
import logging
|
|
import subprocess
|
|
import shutil
|
|
import re
|
|
import threading
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional, Tuple, Callable
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
from core import FileProcessingError
|
|
from config import settings
|
|
from services.ai import ai_provider_factory
|
|
from services.ai.prompt_manager import prompt_manager
|
|
|
|
|
|
def _sanitize_latex(latex_code: str) -> str:
|
|
"""
|
|
Pre-process LaTeX code to fix common errors before compilation.
|
|
|
|
This function applies automated fixes for known issues that AI models
|
|
frequently generate, reducing the need for fix_latex() iterations.
|
|
|
|
Currently handles:
|
|
- TikZ nodes with line breaks (\\\\) missing align=center
|
|
- Unbalanced environments (best effort)
|
|
"""
|
|
if not latex_code:
|
|
return latex_code
|
|
|
|
result = latex_code
|
|
|
|
# Fix TikZ nodes with \\\\ but missing align=center
|
|
# Pattern: \node[...] (name) {Text\\More};
|
|
# This is a common AI error - TikZ requires align=center for \\\\ in nodes
|
|
|
|
# We need to find \node commands and add align=center if they have \\\\ in content
|
|
# but don't already have align= in their options
|
|
|
|
def fix_tikz_node(match):
|
|
"""Fix a single TikZ node by adding align=center if needed"""
|
|
full_match = match.group(0)
|
|
options = match.group(1) # Content inside [...]
|
|
rest = match.group(2) # Everything after options
|
|
|
|
# Check if this node has \\\\ in its content (text between { })
|
|
# and doesn't already have align=
|
|
if "\\\\" in rest and "align=" not in options:
|
|
# Add align=center to the options
|
|
if options.strip():
|
|
new_options = options.rstrip() + ", align=center"
|
|
else:
|
|
new_options = "align=center"
|
|
return f"\\node[{new_options}]{rest}"
|
|
|
|
return full_match
|
|
|
|
# Match \node[options] followed by rest of the line
|
|
# Capture options and the rest separately
|
|
tikz_node_pattern = r"\\node\[([^\]]*)\]([^;]*;)"
|
|
result = re.sub(tikz_node_pattern, fix_tikz_node, result)
|
|
|
|
return result
|
|
|
|
|
|
class DocumentGenerator:
|
|
"""
|
|
Generates academic summary documents in LaTeX format.
|
|
|
|
The system follows these principles:
|
|
1. latex/resumen.md is the SINGLE SOURCE OF TRUTH for prompt structure
|
|
2. Generates full LaTeX documents (not Markdown)
|
|
3. Compiles to PDF using pdflatex
|
|
4. Supports iterative fixing with AI if compilation fails
|
|
5. Supports progress notifications via callback
|
|
"""
|
|
|
|
def __init__(self, notification_callback: Optional[Callable[[str], None]] = None):
|
|
"""
|
|
Initialize DocumentGenerator.
|
|
|
|
Args:
|
|
notification_callback: Optional callback function for progress notifications
|
|
Takes a single string argument (message to send)
|
|
"""
|
|
self.logger = logging.getLogger(__name__)
|
|
self.ai_provider = ai_provider_factory.get_best_provider()
|
|
self.notification_callback = notification_callback
|
|
self.use_parallel = ai_provider_factory.use_parallel()
|
|
self.executor = ThreadPoolExecutor(max_workers=4)
|
|
|
|
# Ensure output directories exist
|
|
settings.LOCAL_DOWNLOADS_PATH.mkdir(parents=True, exist_ok=True)
|
|
settings.LOCAL_DOCX.mkdir(parents=True, exist_ok=True)
|
|
|
|
if self.use_parallel:
|
|
self.logger.info(
|
|
"🚀 Parallel processing enabled: Multiple AI providers available"
|
|
)
|
|
|
|
def _notify(self, message: str) -> None:
|
|
"""Send notification if callback is configured"""
|
|
if self.notification_callback:
|
|
try:
|
|
self.notification_callback(message)
|
|
except Exception as e:
|
|
self.logger.warning(f"Failed to send notification: {e}")
|
|
|
|
def _generate_with_parallel_provider(self, prompt: str, **kwargs) -> str:
|
|
"""
|
|
Generate content using multiple AI providers in parallel.
|
|
|
|
Races multiple providers and returns the first successful response,
|
|
or the best quality response if using consensus strategy.
|
|
"""
|
|
try:
|
|
parallel_provider = ai_provider_factory.get_parallel_provider(max_workers=4)
|
|
self.logger.info("🚀 Using parallel AI provider (race mode)")
|
|
|
|
result = parallel_provider.generate_parallel(
|
|
prompt=prompt,
|
|
strategy="race", # Use first successful response
|
|
timeout_ms=300000, # 5 minutes
|
|
**kwargs,
|
|
)
|
|
|
|
self.logger.info(
|
|
f"✅ Parallel generation complete: {result.selected_provider} selected, "
|
|
f"{result.total_duration_ms}ms"
|
|
)
|
|
|
|
return result.content
|
|
|
|
except Exception as e:
|
|
self.logger.warning(
|
|
f"⚠️ Parallel generation failed: {e}, falling back to single provider"
|
|
)
|
|
return self.ai_provider.generate_text(prompt, **kwargs)
|
|
|
|
def _convert_formats_parallel(
|
|
self, tex_path: Path, pdf_path: Optional[Path], base_name: str
|
|
) -> Optional[Path]:
|
|
"""
|
|
Convert to multiple formats in parallel (DOCX, optionally PDF).
|
|
|
|
If PDF is already compiled, only DOCX is generated.
|
|
Otherwise, both PDF and DOCX are generated in parallel.
|
|
"""
|
|
futures = {}
|
|
|
|
# Generate DOCX
|
|
if shutil.which("pandoc"):
|
|
futures["docx"] = self.executor.submit(
|
|
self._convert_tex_to_docx, tex_path, base_name
|
|
)
|
|
|
|
# Wait for DOCX completion
|
|
docx_path = None
|
|
if "docx" in futures:
|
|
try:
|
|
docx_path = futures["docx"].result(timeout=60)
|
|
if docx_path:
|
|
self.logger.info(f"✅ Parallel DOCX generated: {docx_path}")
|
|
except Exception as e:
|
|
self.logger.warning(f"⚠️ DOCX generation failed: {e}")
|
|
|
|
return docx_path
|
|
|
|
def _upload_to_notion_background(
|
|
self,
|
|
base_name: str,
|
|
summary: str,
|
|
pdf_path: Optional[Path],
|
|
metadata: Dict[str, Any],
|
|
):
|
|
"""Upload to Notion in background thread (non-blocking)."""
|
|
|
|
def upload_worker():
|
|
try:
|
|
from services.notion_service import notion_service
|
|
|
|
title = base_name.replace("_", " ").title()
|
|
notion_metadata = {
|
|
"file_type": "Audio",
|
|
"pdf_path": pdf_path or Path(""),
|
|
"add_status": False,
|
|
"use_as_page": False,
|
|
}
|
|
|
|
page_id = notion_service.create_page_with_summary(
|
|
title=title, summary=summary, metadata=notion_metadata
|
|
)
|
|
|
|
if page_id:
|
|
metadata["notion_uploaded"] = True
|
|
metadata["notion_page_id"] = page_id
|
|
self.logger.info(
|
|
f"✅ Background upload to Notion complete: {title}"
|
|
)
|
|
else:
|
|
self.logger.warning(f"⚠️ Background Notion upload failed: {title}")
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"❌ Background Notion upload error: {e}")
|
|
|
|
# Start background thread
|
|
thread = threading.Thread(target=upload_worker, daemon=True)
|
|
thread.start()
|
|
self.logger.info("🔄 Notion upload started in background")
|
|
|
|
    def generate_summary(
        self,
        text: str,
        base_name: str,
        materia: str = "Economía",
        bibliographic_text: Optional[str] = None,
        class_number: Optional[int] = None,
    ) -> Tuple[bool, str, Dict[str, Any]]:
        """
        Generate comprehensive academic summary in LaTeX format.

        Pipeline: (1) build prompt and generate LaTeX with the AI provider
        (parallel racing when available), (2) extract clean LaTeX from the
        response, (3) compile with up to 3 AI-assisted fix retries,
        (4) on success convert to DOCX in parallel and upload to Notion in
        the background; on failure fall back to Markdown output.

        Args:
            text: The class transcription text
            base_name: Base filename for output files
            materia: Subject name (default: "Economía")
            bibliographic_text: Optional supporting material from books/notes
            class_number: Optional class number for header

        Returns:
            Tuple of (success, summary_text, metadata). `metadata` contains
            the output file paths, a summary snippet, and Notion upload state;
            on a critical error (False, "", metadata) is returned.
        """
        self.logger.info(
            f"🚀 Starting LaTeX academic summary generation for: {base_name}"
        )

        # Result record; paths are filled in as each artifact is produced.
        metadata = {
            "filename": base_name,
            "tex_path": "",
            "pdf_path": "",
            "markdown_path": "",
            "docx_path": "",
            "summary_snippet": "",
            "notion_uploaded": False,
            "notion_page_id": None,
            "materia": materia,
        }

        try:
            # === STEP 1: Generate LaTeX content using AI ===
            self.logger.info(
                "🧠 Sending request to AI Provider for LaTeX generation..."
            )
            self._notify("📝 Preparando prompt de resumen académico...")

            # Prompt structure comes from latex/resumen.md via the prompt manager.
            prompt = prompt_manager.get_latex_summary_prompt(
                transcription=text,
                materia=materia,
                bibliographic_text=bibliographic_text,
                class_number=class_number,
            )

            self._notify(
                "🧠 Enviando solicitud a la IA (esto puede tardar unos minutos)..."
            )

            # Use parallel provider if multiple AI providers are available
            if self.use_parallel:
                raw_response = self._generate_with_parallel_provider(prompt)
            else:
                raw_response = self.ai_provider.generate_text(prompt)

            if not raw_response:
                raise FileProcessingError("AI returned empty response")

            self.logger.info(f"📝 AI response received: {len(raw_response)} characters")
            self._notify(f"✅ Respuesta recibida ({len(raw_response)} caracteres)")

            # === STEP 2: Extract clean LaTeX from AI response ===
            self._notify("🔍 Extrayendo código LaTeX...")

            latex_content = prompt_manager.extract_latex_from_response(raw_response)

            if not latex_content:
                self.logger.warning(
                    "⚠️ No valid LaTeX found in response, treating as Markdown"
                )
                self._notify("⚠️ No se detectó LaTeX válido, usando modo compatible...")
                # Fallback to Markdown processing
                return self._fallback_to_markdown(raw_response, base_name, metadata)

            self.logger.info("✨ Valid LaTeX content detected")
            self._notify(f"✨ LaTeX detectado: {len(latex_content)} caracteres")

            # === STEP 3: Compilation Loop with Self-Correction ===
            # Up to max_retries AI fixes, i.e. max_retries + 1 compile attempts.
            max_retries = 3
            current_latex = latex_content

            for attempt in range(max_retries + 1):
                # Sanitize LaTeX before saving (fix common AI errors like TikZ nodes)
                current_latex = _sanitize_latex(current_latex)

                # Save current .tex file
                self._notify(
                    f"📄 Guardando archivo .tex (intento {attempt + 1}/{max_retries + 1})..."
                )

                tex_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.tex"
                tex_path.write_text(current_latex, encoding="utf-8")
                metadata["tex_path"] = str(tex_path)

                # Try to compile
                self._notify("⚙️ Primera pasada de compilación LaTeX...")

                pdf_path = self._compile_latex(
                    tex_path, output_dir=settings.LOCAL_DOWNLOADS_PATH
                )

                if pdf_path:
                    # Success path: produce DOCX, snippet, and Notion upload.
                    self.logger.info(
                        f"✅ Compilation success on attempt {attempt + 1}!"
                    )
                    self._notify("✅ PDF generado exitosamente!")
                    metadata["pdf_path"] = str(pdf_path)

                    # Generate DOCX in parallel
                    self._notify("📄 Generando archivo DOCX en paralelo...")
                    docx_path = self._convert_formats_parallel(
                        tex_path, pdf_path, base_name
                    )
                    if docx_path:
                        self._notify("✅ DOCX generado exitosamente!")
                        metadata["docx_path"] = str(docx_path)

                    # Create a text summary for Notion/preview
                    text_summary = self._create_text_summary(current_latex)
                    metadata["summary_snippet"] = text_summary[:500] + "..."

                    # Upload to Notion in background if configured
                    # (non-blocking; the worker mutates metadata when done).
                    if settings.has_notion_config:
                        self._notify("📤 Iniciando carga a Notion en segundo plano...")
                        self._upload_to_notion_background(
                            base_name=base_name,
                            summary=text_summary,
                            pdf_path=pdf_path,
                            metadata=metadata,
                        )

                    self._notify("🎉 ¡Resumen completado con éxito!")
                    return True, text_summary, metadata

                # Compilation failed - ask AI to fix
                if attempt < max_retries:
                    self.logger.warning(
                        f"⚠️ Compilation failed (Attempt {attempt + 1}/{max_retries + 1}). "
                        f"Requesting AI fix..."
                    )
                    self._notify(
                        f"⚠️ Error de compilación ({attempt + 1}/{max_retries + 1}), solicitando corrección a IA..."
                    )

                    # Get error log (last 2000 chars are enough context for the AI)
                    log_file = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.log"
                    error_log = "Log file not found"
                    if log_file.exists():
                        error_log = log_file.read_text(
                            encoding="utf-8", errors="ignore"
                        )[-2000:]

                    # Ask AI to fix; break out of the retry loop if it cannot.
                    try:
                        self._notify("🔧 La IA está corrigiendo el código LaTeX...")
                        if hasattr(self.ai_provider, "fix_latex"):
                            fixed_latex = self.ai_provider.fix_latex(
                                current_latex, error_log
                            )
                            # Prefer the extracted LaTeX; fall back to the raw fix.
                            cleaned = prompt_manager.extract_latex_from_response(
                                fixed_latex
                            )
                            if cleaned:
                                current_latex = cleaned
                            else:
                                current_latex = fixed_latex
                            self._notify(
                                "✅ Código LaTeX corregido, reintentando compilación..."
                            )
                        else:
                            self.logger.error(
                                "❌ AI provider doesn't support fix_latex()"
                            )
                            break
                    except Exception as e:
                        self.logger.error(f"❌ AI fix request failed: {e}")
                        break
                else:
                    self.logger.error(
                        "❌ Max retries reached. LaTeX compilation failed."
                    )
                    self._notify(
                        "❌ No se pudo compilar el LaTeX después de varios intentos"
                    )

            # If we get here, all compilation attempts failed
            self._notify("⚠️ Usando modo de compatibilidad Markdown...")
            return self._fallback_to_markdown(
                current_latex or raw_response, base_name, metadata
            )

        except Exception as e:
            self.logger.error(
                f"❌ Critical error in document generation: {e}", exc_info=True
            )
            self._notify(f"❌ Error en la generación: {str(e)[:100]}")
            return False, "", metadata
|
|
|
|
def _compile_latex(self, tex_path: Path, output_dir: Path) -> Optional[Path]:
|
|
"""
|
|
Compile LaTeX to PDF using pdflatex. Runs twice for TOC.
|
|
|
|
Args:
|
|
tex_path: Path to .tex file
|
|
output_dir: Directory for output files
|
|
|
|
Returns:
|
|
Path to generated PDF or None if failed
|
|
"""
|
|
base_name = tex_path.stem
|
|
expected_pdf = output_dir / f"{base_name}.pdf"
|
|
|
|
# Check if pdflatex is available
|
|
if not shutil.which("pdflatex"):
|
|
self.logger.error("🚫 pdflatex not found in system PATH")
|
|
return None
|
|
|
|
cmd = [
|
|
"pdflatex",
|
|
"-interaction=nonstopmode",
|
|
"-halt-on-error",
|
|
f"-output-directory={output_dir}",
|
|
str(tex_path),
|
|
]
|
|
|
|
try:
|
|
# Pass 1
|
|
self.logger.info("⚙️ Compiling LaTeX (Pass 1/2)...")
|
|
subprocess.run(
|
|
cmd,
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.PIPE,
|
|
check=False,
|
|
timeout=120,
|
|
)
|
|
|
|
# Pass 2 (for TOC resolution)
|
|
self.logger.info("⚙️ Compiling LaTeX (Pass 2/2)...")
|
|
result = subprocess.run(
|
|
cmd,
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.PIPE,
|
|
check=False,
|
|
timeout=120,
|
|
)
|
|
|
|
if result.returncode == 0 and expected_pdf.exists():
|
|
self.logger.info(f"✅ PDF generated: {expected_pdf}")
|
|
self._cleanup_latex_aux(output_dir, base_name)
|
|
return expected_pdf
|
|
else:
|
|
# Read log file for error info
|
|
log_file = output_dir / f"{base_name}.log"
|
|
error_snippet = "Unknown error"
|
|
if log_file.exists():
|
|
try:
|
|
log_content = log_file.read_text(
|
|
encoding="utf-8", errors="ignore"
|
|
)
|
|
errors = [
|
|
line
|
|
for line in log_content.splitlines()
|
|
if line.startswith("!")
|
|
]
|
|
if errors:
|
|
error_snippet = errors[0][:200]
|
|
except:
|
|
pass
|
|
|
|
self.logger.error(f"❌ LaTeX compilation failed: {error_snippet}")
|
|
return None
|
|
|
|
except subprocess.TimeoutExpired:
|
|
self.logger.error("❌ LaTeX compilation timed out")
|
|
return None
|
|
except Exception as e:
|
|
self.logger.error(f"❌ Error during LaTeX execution: {e}")
|
|
return None
|
|
|
|
def _convert_tex_to_docx(self, tex_path: Path, base_name: str) -> Optional[Path]:
|
|
"""Convert .tex to .docx using Pandoc."""
|
|
if not shutil.which("pandoc"):
|
|
self.logger.warning("⚠️ pandoc not found, skipping DOCX generation")
|
|
return None
|
|
|
|
docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
|
|
cmd = ["pandoc", str(tex_path), "-o", str(docx_path)]
|
|
|
|
try:
|
|
subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
|
|
self.logger.info(f"✅ DOCX generated: {docx_path}")
|
|
return docx_path
|
|
except Exception as e:
|
|
self.logger.warning(f"⚠️ DOCX generation failed: {e}")
|
|
return None
|
|
|
|
def _create_text_summary(self, latex_content: str) -> str:
|
|
"""Extract a plain text summary from LaTeX content for Notion/preview."""
|
|
# Remove LaTeX commands and keep content
|
|
text = latex_content
|
|
|
|
# Remove document class and packages
|
|
text = re.sub(r"\\documentclass\[?[^\]]*\]?\{[^\}]+\}", "", text)
|
|
text = re.sub(r"\\usepackage\{[^\}]+\}", "", text)
|
|
text = re.sub(r"\\geometry\{[^\}]+\}", "", text)
|
|
text = re.sub(r"\\pagestyle\{[^\}]+\}", "", text)
|
|
text = re.sub(r"\\fancyhf\{\}", "", text)
|
|
text = re.sub(r"\\fancyhead\[?[^\]]*\]?\{[^\}]+\}", "", text)
|
|
text = re.sub(r"\\fancyfoot\[?[^\]]*\]?\{[^\}]+\}", "", text)
|
|
|
|
# Convert sections to markdown-style
|
|
text = re.sub(r"\\section\*?\{([^\}]+)\}", r"# \1", text)
|
|
text = re.sub(r"\\subsection\*?\{([^\}]+)\}", r"## \1", text)
|
|
text = re.sub(r"\\subsubsection\*?\{([^\}]+)\}", r"### \1", text)
|
|
|
|
# Remove tcolorbox environments (keep content)
|
|
text = re.sub(
|
|
r"\\begin\{(definicion|importante|ejemplo)\}\[?[^\]]*\]?",
|
|
r"\n**\1:** ",
|
|
text,
|
|
)
|
|
text = re.sub(r"\\end\{(definicion|importante|ejemplo)\}", "", text)
|
|
|
|
# Convert itemize to bullets
|
|
text = re.sub(r"\\item\s*", "- ", text)
|
|
text = re.sub(r"\\begin\{(itemize|enumerate)\}", "", text)
|
|
text = re.sub(r"\\end\{(itemize|enumerate)\}", "", text)
|
|
|
|
# Clean up math (basic)
|
|
text = re.sub(r"\$\$([^\$]+)\$\$", r"\n\n\1\n\n", text)
|
|
text = re.sub(r"\$([^\$]+)\$", r"\1", text)
|
|
|
|
# Remove remaining LaTeX commands
|
|
text = re.sub(r"\\[a-zA-Z]+(\{[^\}]*\})*", "", text)
|
|
text = re.sub(r"[{}]", "", text)
|
|
|
|
# Clean whitespace
|
|
text = re.sub(r"\n\s*\n\s*\n", "\n\n", text)
|
|
text = text.strip()
|
|
|
|
return text
|
|
|
|
def _fallback_to_markdown(
|
|
self, content: str, base_name: str, metadata: Dict[str, Any]
|
|
) -> Tuple[bool, str, Dict[str, Any]]:
|
|
"""Fallback when LaTeX generation fails."""
|
|
self.logger.warning("⚠️ Falling back to Markdown processing")
|
|
|
|
md_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_resumen.md"
|
|
md_path.write_text(content, encoding="utf-8")
|
|
metadata["markdown_path"] = str(md_path)
|
|
|
|
# Try to convert to PDF via pandoc
|
|
if shutil.which("pandoc"):
|
|
pdf_path = self._convert_md_to_pdf(md_path, base_name)
|
|
if pdf_path:
|
|
metadata["pdf_path"] = str(pdf_path)
|
|
|
|
docx_path = self._convert_md_to_docx(md_path, base_name)
|
|
if docx_path:
|
|
metadata["docx_path"] = str(docx_path)
|
|
|
|
metadata["summary_snippet"] = content[:500] + "..."
|
|
return True, content, metadata
|
|
|
|
def _convert_md_to_pdf(self, md_path: Path, base_name: str) -> Optional[Path]:
|
|
"""Convert Markdown to PDF using pandoc."""
|
|
pdf_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.pdf"
|
|
cmd = [
|
|
"pandoc",
|
|
str(md_path),
|
|
"-o",
|
|
str(pdf_path),
|
|
"--pdf-engine=pdflatex",
|
|
"-V",
|
|
"geometry:margin=2.5cm",
|
|
]
|
|
|
|
try:
|
|
subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
|
|
self.logger.info(f"✅ PDF from Markdown: {pdf_path}")
|
|
return pdf_path
|
|
except Exception as e:
|
|
self.logger.warning(f"⚠️ PDF from Markdown failed: {e}")
|
|
return None
|
|
|
|
def _convert_md_to_docx(self, md_path: Path, base_name: str) -> Optional[Path]:
|
|
"""Convert Markdown to DOCX using pandoc."""
|
|
docx_path = settings.LOCAL_DOCX / f"{base_name}.docx"
|
|
cmd = ["pandoc", str(md_path), "-o", str(docx_path)]
|
|
|
|
try:
|
|
subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
|
|
self.logger.info(f"✅ DOCX from Markdown: {docx_path}")
|
|
return docx_path
|
|
except Exception as e:
|
|
self.logger.warning(f"⚠️ DOCX from Markdown failed: {e}")
|
|
return None
|
|
|
|
def _cleanup_latex_aux(self, output_dir: Path, base_name: str):
|
|
"""Clean up auxiliary LaTeX files."""
|
|
extensions = [".aux", ".log", ".out", ".toc"]
|
|
for ext in extensions:
|
|
aux_file = output_dir / f"{base_name}{ext}"
|
|
if aux_file.exists():
|
|
try:
|
|
aux_file.unlink()
|
|
except:
|
|
pass
|
|
|
|
def _upload_to_notion(
|
|
self,
|
|
base_name: str,
|
|
summary: str,
|
|
pdf_path: Optional[Path],
|
|
metadata: Dict[str, Any],
|
|
):
|
|
"""Upload summary to Notion if configured."""
|
|
try:
|
|
from services.notion_service import notion_service
|
|
|
|
title = base_name.replace("_", " ").title()
|
|
notion_metadata = {
|
|
"file_type": "Audio",
|
|
"pdf_path": pdf_path or Path(""),
|
|
"add_status": False,
|
|
"use_as_page": False,
|
|
}
|
|
|
|
page_id = notion_service.create_page_with_summary(
|
|
title=title, summary=summary, metadata=notion_metadata
|
|
)
|
|
|
|
if page_id:
|
|
metadata["notion_uploaded"] = True
|
|
metadata["notion_page_id"] = page_id
|
|
self.logger.info(f"✅ Uploaded to Notion: {title}")
|
|
else:
|
|
self.logger.warning(f"⚠️ Notion upload failed: {title}")
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"❌ Notion upload error: {e}")
|