Added summarize_el_principe.py script with:

- Text processing and chapter-based chunking
- Gemini API integration for summarization
- JSON chunk-summary cache storage
- PDF report generation via pandoc
- Configuration management via .env file

Includes:

- Source text "el principe.txt"
- Generated summary "resumen_el_principe.md"
- PDF output "resumen_el_principe.pdf"
- Chunk summaries "chunk_summaries.json"

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
458 lines
14 KiB
Python
#!/usr/bin/env python3
"""
Generate a multi-step summary of `el principe.txt` using Gemini, writing both
Markdown and PDF outputs. The script splits the source text into chapters to
stay within API limits, requests a partial summary for each chapter, and then
assembles the results into a styled Markdown document that is converted to
PDF with pandoc.
"""
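
# Usage (assumes GEMINI_API_KEY is set in the environment or in .env):
#   python summarize_el_principe.py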

from __future__ import annotations

import hashlib
import json
import os
import re
import subprocess
import time
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, cast

from google import genai
from google.genai import errors as genai_errors
from google.genai import types as genai_types

TEXT_PATH = Path("el principe.txt")
MD_OUTPUT = Path("resumen_el_principe.md")
PDF_OUTPUT = Path("resumen_el_principe.pdf")
CHUNK_CACHE_PATH = Path("chunk_summaries.json")

PAUSE_SECONDS = 3.0  # small pause between successful calls
RATE_LIMIT_BACKOFF_SECONDS = 20.0  # wait time (per attempt) after 429s
MAX_RETRIES = 6

DEFAULT_MODEL = "gemini-2.0-flash"

TARGET_PAGES = 26  # expected number of chapters
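

# One parsed chapter of the source text; `hash` fingerprints the body so the
# chunk cache can tell when a chapter's content has changed.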
@dataclass
class Chapter:
    index: int
    title: str
    body: str
    hash: str


TIMESTAMP_RE = re.compile(r"^\s*\[\d{2}:\d{2}:\d{2}\]\s*")


def strip_timestamp(line: str) -> str:
    return TIMESTAMP_RE.sub("", line, count=1)


ROMAN_VALUES = {
    "I": 1,
    "V": 5,
    "X": 10,
    "L": 50,
    "C": 100,
    "D": 500,
    "M": 1000,
}
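

# Standard subtractive Roman-numeral parsing: scan right to left, subtracting
# a value when it is smaller than the value to its right, e.g.
# roman_to_int("XXVI") == 26 and roman_to_int("IX") == 9; returns None for
# characters outside ROMAN_VALUES.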
def roman_to_int(value: str) -> Optional[int]:
    value = value.upper()
    total = 0
    prev = 0
    for char in reversed(value):
        if char not in ROMAN_VALUES:
            return None
        current = ROMAN_VALUES[char]
        if current < prev:
            total -= current
        else:
            total += current
        prev = current
    return total
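

# A heading can appear as "Capítulo IV ..." / "Capítulo 4: ..." (Roman or
# Arabic numerals), as "4. Título", or as "Puesto 4. ..." / "P. 4. ...";
# returns (number, title) for a match and None otherwise.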
def detect_chapter_heading(line: str) -> Optional[Tuple[int, str]]:
    clean = strip_timestamp(line).strip()
    if not clean:
        return None

    match = re.match(
        r"(?i)cap[ií]tulo\s+([ivxlcdm]+|\d+)(?:\s*[\.\-:]?\s*(.*))?",
        clean,
    )
    if match:
        raw_label = match.group(1)
        rest = (match.group(2) or "").strip(" .:-")
        number = (
            int(raw_label)
            if raw_label.isdigit()
            else roman_to_int(raw_label)
        )
        if number:
            title = rest or f"Capítulo {number}"
            return number, title

    match = re.match(r"(\d{1,2})\.\s*([A-ZÁÉÍÓÚ].*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title

    match = re.match(r"(?i)(?:puesto|p)\.?\s*(\d{1,2})\.\s*(.*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title

    return None
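

# Single pass over the transcript: each detected heading closes the previous
# chapter (hashing its body for the cache) and opens a new one; lines before
# the first heading are ignored, and the result is sorted by chapter index.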
def split_into_chapters(text: str) -> List[Chapter]:
    lines = text.splitlines()
    chapters: List[Chapter] = []
    current_number: Optional[int] = None
    current_title: str = ""
    current_lines: List[str] = []

    for line in lines:
        heading = detect_chapter_heading(line)
        if heading:
            number, title = heading
            if current_number is not None and current_lines:
                body = "\n".join(current_lines).strip()
                if body:
                    chapters.append(
                        Chapter(
                            index=current_number,
                            title=current_title or f"Capítulo {current_number}",
                            body=body,
                            hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
                        )
                    )
            current_number = number
            current_title = title
            current_lines = []
        elif current_number is not None:
            current_lines.append(strip_timestamp(line))

    if current_number is not None and current_lines:
        body = "\n".join(current_lines).strip()
        chapters.append(
            Chapter(
                index=current_number,
                title=current_title or f"Capítulo {current_number}",
                body=body,
                hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
            )
        )

    chapters.sort(key=lambda ch: ch.index)
    return chapters


class GeminiError(RuntimeError):
    """Raised when the Gemini API does not return usable text."""
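

# Resolution order: the GEMINI_API_KEY environment variable first, then a
# KEY=value line in .env (quotes stripped); a value found in .env is exported
# to the environment so later lookups see it.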
def load_api_key() -> str:
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key:
        return api_key

    env_path = Path(".env")
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.strip().startswith("GEMINI_API_KEY"):
                _, _, raw_value = line.partition("=")
                value = raw_value.strip().strip("\"' ")
                if value:
                    os.environ["GEMINI_API_KEY"] = value
                    return value

    raise RuntimeError(
        "La variable de entorno GEMINI_API_KEY no está configurada. "
        "Define la clave en el entorno o en .env."
    )
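

# Retries use linear backoff: 429 responses wait RATE_LIMIT_BACKOFF_SECONDS *
# attempt, 5xx errors wait PAUSE_SECONDS * attempt, and any other client error
# aborts immediately.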
def call_gemini(
    client: genai.Client,
    model: str,
    prompt: str,
    *,
    temperature: float = 0.4,
    top_p: float = 0.95,
    top_k: int = 32,
    max_output_tokens: int = 1536,
) -> str:
    """Invoke the Gemini API with retries and return the generated text."""
    config = genai_types.GenerateContentConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_output_tokens=max_output_tokens,
        response_mime_type="text/plain",
        response_modalities=["TEXT"],
    )

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.models.generate_content(
                model=model,
                contents=prompt,
                config=config,
            )
        except genai_errors.ClientError as exc:
            wait_time: Optional[float] = None
            if exc.code == 429:
                wait_time = RATE_LIMIT_BACKOFF_SECONDS * attempt
                print(
                    f"Advertencia: respuesta 429 (intento {attempt}/{MAX_RETRIES}): "
                    f"{exc.message}"
                )
            elif exc.code and exc.code >= 500:
                wait_time = PAUSE_SECONDS * attempt
                print(
                    f"Advertencia: error {exc.code} del modelo "
                    f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
                )
            else:
                raise GeminiError(
                    f"Error {exc.code or 'desconocido'} llamando al modelo: {exc}"
                ) from exc

            if wait_time is not None and attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc
        except genai_errors.ServerError as exc:
            wait_time = PAUSE_SECONDS * attempt
            print(
                f"Advertencia: error {exc.code} del servidor "
                f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
            )
            if attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc

        text = (response.text or "").strip()
        if text:
            return text

        # Fall back to concatenating candidate parts when `response.text` is
        # empty; skip candidates whose content was filtered out entirely.
        parts = []
        for candidate in response.candidates or []:
            if candidate.content is None:
                continue
            for part in candidate.content.parts or []:
                value = getattr(part, "text", None)
                if value:
                    parts.append(value)
        if parts:
            return "\n".join(parts).strip()

        raise GeminiError("La respuesta del modelo no contenía texto.")

    raise GeminiError("Se agotaron los reintentos con la API de Gemini.")


def build_chunk_prompt(chapter: Chapter, total: int) -> str:
    return (
        "Eres un historiador y estratega político. Necesito un dossier extenso, "
        "rico en matices, que pueda ocupar aproximadamente una página impresa "
        "para cada capítulo de \"El príncipe\". Mantén un tono analítico, claro y "
        "elegante en español. Sigue esta estructura en cada respuesta:\n\n"
        "#### Panorama narrativo\n"
        "- Explica en 3-4 frases qué sucede en el capítulo y por qué importa.\n\n"
        "#### Tácticas y lecciones\n"
        "- Enumera 3-4 tácticas o ideas centrales con ejemplos del texto.\n\n"
        "#### Recursos memorables\n"
        "- Cita o parafrasea dos pasajes breves y comenta su significado.\n\n"
        "#### Resonancias actuales\n"
        "- Conecta las ideas con situaciones de liderazgo contemporáneo.\n\n"
        f"Capítulo {chapter.index}/{total}: {chapter.title}\n"
        "Transcripción base:\n"
        f"\"\"\"\n{chapter.body}\n\"\"\""
    )
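

# ASCII anchor slugs for the table of contents, e.g.
# slugify_anchor("Capítulo 3: De los principados mixtos")
# == "capitulo-3-de-los-principados-mixtos".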
def slugify_anchor(text: str) -> str:
    normalized = unicodedata.normalize("NFD", text)
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_text.lower()).strip("-")
    return slug or "capitulo"
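

# Assembles the final document: title, epigraph, a linked table of contents,
# and one section per chapter, each preceded by an explicit <a name=...>
# anchor that the TOC links target.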
def compose_markdown(chapters: List[Chapter], summaries: List[str]) -> str:
    toc_lines: List[str] = []
    for chapter in chapters:
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        toc_lines.append(
            f"{chapter.index}. [{anchor_text}](#{anchor})"
        )

    lines = [
        '# Dossier capítulo a capítulo de "El príncipe"',
        "> \"Es mejor ser temido que amado, si no se pueden tener ambas cosas.\"",
        "---",
        "## Tabla de contenidos",
        "\n".join(toc_lines),
        "",
    ]

    for chapter, summary in zip(chapters, summaries):
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        lines.extend(
            [
                f"<a name=\"{anchor}\"></a>",
                f"### {anchor_text}",
                summary.strip(),
                "",
            ]
        )

    return "\n".join(lines).strip()
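

# Cache invalidation is two-level: the whole cache is discarded when the
# source file's signature (size + mtime) changes, and individual entries are
# discarded when a chapter's body hash no longer matches.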
def current_text_signature() -> Dict[str, float]:
    stats = TEXT_PATH.stat()
    return {"size": stats.st_size, "mtime": stats.st_mtime}


def load_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
) -> Dict[int, str]:
    if not CHUNK_CACHE_PATH.exists():
        return {}
    try:
        data = json.loads(CHUNK_CACHE_PATH.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return {}
    if data.get("text_signature") != signature:
        return {}

    chapter_map = {chapter.index: chapter for chapter in chapters}
    cache: Dict[int, str] = {}
    for entry in data.get("chapters", []):
        idx = entry.get("index")
        summary = entry.get("summary")
        stored_hash = entry.get("hash")
        chapter = chapter_map.get(idx)
        if (
            chapter
            and isinstance(summary, str)
            and stored_hash == chapter.hash
        ):
            cache[idx] = summary
    return cache


def persist_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
    summaries: List[Optional[str]],
) -> None:
    entries = [
        {
            "index": chapter.index,
            "title": chapter.title,
            "hash": chapter.hash,
            "summary": summary,
        }
        for chapter, summary in zip(chapters, summaries)
        if summary is not None
    ]
    payload = {
        "text_signature": signature,
        "chapters": entries,
    }
    CHUNK_CACHE_PATH.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
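

# Pipeline driver: detect chapters, restore cached summaries, ask Gemini for
# the missing ones (persisting the cache after every chapter so interrupted
# runs can resume), then write the assembled Markdown.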
def generate_summary() -> str:
    api_key = load_api_key()
    model = os.getenv("GEMINI_MODEL", DEFAULT_MODEL)
    client = genai.Client(api_key=api_key)
    text = TEXT_PATH.read_text(encoding="utf-8")
    chapters = split_into_chapters(text)
    if not chapters:
        raise GeminiError("No se pudieron detectar capítulos en el texto.")
    if len(chapters) != TARGET_PAGES:
        print(
            f"Advertencia: se detectaron {len(chapters)} capítulos, "
            f"pero se esperaban {TARGET_PAGES}."
        )

    signature = current_text_signature()
    cached = load_chunk_cache(signature, chapters)

    chapter_summaries: List[Optional[str]] = [None] * len(chapters)
    for pos, chapter in enumerate(chapters):
        cached_summary = cached.get(chapter.index)
        if cached_summary:
            chapter_summaries[pos] = cached_summary

    total = len(chapters)
    print(f"Procesando {total} capítulos con el modelo {model}...")

    for idx, chapter in enumerate(chapters, start=1):
        if chapter_summaries[idx - 1]:
            print(f"- Capítulo {chapter.index}/{total} restaurado desde cache.")
            continue

        prompt = build_chunk_prompt(chapter, total)
        summary = call_gemini(client, model, prompt, max_output_tokens=1536)
        chapter_summaries[idx - 1] = summary
        persist_chunk_cache(signature, chapters, chapter_summaries)
        print(
            f"- Capítulo {chapter.index}/{total} sintetizado "
            f"({len(chapter.body)} caracteres)."
        )
        time.sleep(PAUSE_SECONDS)

    completed_summaries = cast(
        List[str],
        [s for s in chapter_summaries if s is not None],
    )
    if len(completed_summaries) != total:
        raise GeminiError("No se completaron todos los fragmentos.")

    final_markdown = compose_markdown(chapters, completed_summaries)
    MD_OUTPUT.write_text(final_markdown.strip() + "\n", encoding="utf-8")
    return final_markdown
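

# PDF generation shells out to pandoc, which must be on PATH along with a
# LaTeX engine (xelatex).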
def main() -> None:
    generate_summary()
    print(f"\nResumen Markdown guardado en: {MD_OUTPUT}")

    PDF_OUTPUT.unlink(missing_ok=True)  # remove the previous PDF if it exists
    result = subprocess.run(
        [
            "pandoc",
            str(MD_OUTPUT),
            "--standalone",
            "--pdf-engine=xelatex",
            "-o",
            str(PDF_OUTPUT),
        ]
    )
    if result.returncode != 0:
        raise RuntimeError(
            "pandoc no pudo crear el PDF. Revisa que LaTeX esté instalado."
        )

    print(f"PDF generado en: {PDF_OUTPUT}")


if __name__ == "__main__":
    main()