Initial commit: Python script for summarizing "El príncipe"

Added summarize_el_principe.py script with:
- Text processing and chunking functionality
- Gemini API integration for summarization
- JSON chunk summaries storage
- PDF report generation
- Configuration management via .env file

Includes:
- Source text "el principe.txt"
- Generated summary "resumen_el_principe.md"
- PDF output "resumen_el_principe.pdf"
- Chunk summaries "chunk_summaries.json"

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
commit 2ab4887847
Author: renato97
Date: 2025-11-10 21:31:00 +00:00
6 changed files with 3235 additions and 0 deletions

summarize_el_principe.py

@@ -0,0 +1,457 @@
#!/usr/bin/env python3
"""
Generate a chapter-by-chapter summary of `el principe.txt` using Gemini,
writing both Markdown and PDF outputs. The script splits the source text into
chapters to stay within API limits, requests one summary per chapter, and then
assembles the cached summaries into a styled Markdown document that is later
converted to PDF via pandoc.
"""
from __future__ import annotations
import hashlib
import json
import os
import re
import subprocess
import time
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, cast
from google import genai
from google.genai import errors as genai_errors
from google.genai import types as genai_types
TEXT_PATH = Path("el principe.txt")
MD_OUTPUT = Path("resumen_el_principe.md")
PDF_OUTPUT = Path("resumen_el_principe.pdf")
CHUNK_CACHE_PATH = Path("chunk_summaries.json")
PAUSE_SECONDS = 3.0 # small pause between successful calls
RATE_LIMIT_BACKOFF_SECONDS = 20.0 # wait time (per attempt) after 429s
MAX_RETRIES = 6
DEFAULT_MODEL = "gemini-2.0-flash"
TARGET_PAGES = 26  # expected number of chapters
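
# These knobs are deliberately conservative: each successful call pauses for
# PAUSE_SECONDS, 429 responses back off linearly (RATE_LIMIT_BACKOFF_SECONDS
# * attempt) for up to MAX_RETRIES attempts, and DEFAULT_MODEL can be
# overridden at runtime through the GEMINI_MODEL environment variable.
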

@dataclass
class Chapter:
    index: int
    title: str
    body: str
    hash: str

TIMESTAMP_RE = re.compile(r"^\s*\[\d{2}:\d{2}:\d{2}\]\s*")

def strip_timestamp(line: str) -> str:
    return TIMESTAMP_RE.sub("", line, count=1)

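
# Illustrative behavior of strip_timestamp (the input line is hypothetical):
#   strip_timestamp("[00:12:34] Capítulo I") -> "Capítulo I"
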

ROMAN_VALUES = {
    "I": 1,
    "V": 5,
    "X": 10,
    "L": 50,
    "C": 100,
    "D": 500,
    "M": 1000,
}


def roman_to_int(value: str) -> Optional[int]:
    value = value.upper()
    total = 0
    prev = 0
    for char in reversed(value):
        if char not in ROMAN_VALUES:
            return None
        current = ROMAN_VALUES[char]
        if current < prev:
            total -= current
        else:
            total += current
        prev = current
    return total

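
# Illustrative values: roman_to_int("XIV") -> 14, roman_to_int("XXVI") -> 26;
# any character outside ROMAN_VALUES yields None, which the heading detector
# below treats as "not a chapter number".
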

def detect_chapter_heading(line: str) -> Optional[Tuple[int, str]]:
    clean = strip_timestamp(line).strip()
    if not clean:
        return None
    match = re.match(
        r"(?i)cap[ií]tulo\s+([ivxlcdm]+|\d+)(?:\s*[\.\-:]?\s*(.*))?",
        clean,
    )
    if match:
        raw_label = match.group(1)
        rest = (match.group(2) or "").strip(" .:-")
        number = (
            int(raw_label)
            if raw_label.isdigit()
            else roman_to_int(raw_label)
        )
        if number:
            title = rest or f"Capítulo {number}"
            return number, title
    match = re.match(r"(\d{1,2})\.\s*([A-ZÁÉÍÓÚ].*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title
    match = re.match(r"(?i)(?:puesto|p)\.?\s*(\d{1,2})\.\s*(.*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title
    return None

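
# The three accepted heading shapes, shown with hypothetical example lines:
#   "Capítulo XII. De los principados nuevos"   (roman or arabic numeral)
#   "12. De los principados nuevos"             (number + capitalized title)
#   "P. 12. De los principados nuevos"          ("puesto"/"p." prefix variant)
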

def split_into_chapters(text: str) -> List[Chapter]:
    lines = text.splitlines()
    chapters: List[Chapter] = []
    current_number: Optional[int] = None
    current_title: str = ""
    current_lines: List[str] = []
    for line in lines:
        heading = detect_chapter_heading(line)
        if heading:
            number, title = heading
            if current_number is not None and current_lines:
                body = "\n".join(current_lines).strip()
                if body:
                    chapters.append(
                        Chapter(
                            index=current_number,
                            title=current_title or f"Capítulo {current_number}",
                            body=body,
                            hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
                        )
                    )
            current_number = number
            current_title = title
            current_lines = []
        elif current_number is not None:
            current_lines.append(strip_timestamp(line))
    if current_number is not None and current_lines:
        body = "\n".join(current_lines).strip()
        chapters.append(
            Chapter(
                index=current_number,
                title=current_title or f"Capítulo {current_number}",
                body=body,
                hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
            )
        )
    chapters.sort(key=lambda ch: ch.index)
    return chapters

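
# Note: any preamble before the first detected heading is discarded, and the
# final sort by chapter number tolerates out-of-order headings in the source.
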

class GeminiError(RuntimeError):
    """Raised when the Gemini API does not return usable text."""


def load_api_key() -> str:
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key:
        return api_key
    env_path = Path(".env")
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.strip().startswith("GEMINI_API_KEY"):
                _, _, raw_value = line.partition("=")
                value = raw_value.strip().strip("\"' ")
                if value:
                    os.environ["GEMINI_API_KEY"] = value
                    return value
    raise RuntimeError(
        "La variable de entorno GEMINI_API_KEY no está configurada. "
        "Define la clave en el entorno o en .env."
    )

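
# Expected .env layout (the value below is a placeholder, not a real key):
#   GEMINI_API_KEY=your-api-key-here
# Surrounding quotes are tolerated; they are stripped when the file is parsed.
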

def call_gemini(
    client: genai.Client,
    model: str,
    prompt: str,
    *,
    temperature: float = 0.4,
    top_p: float = 0.95,
    top_k: int = 32,
    max_output_tokens: int = 1536,
) -> str:
    """Invoke the Gemini API with retries and return the generated text."""
    config = genai_types.GenerateContentConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_output_tokens=max_output_tokens,
        response_mime_type="text/plain",
        response_modalities=["TEXT"],
    )
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.models.generate_content(
                model=model,
                contents=prompt,
                config=config,
            )
        except genai_errors.ClientError as exc:
            wait_time: Optional[float] = None
            if exc.code == 429:
                wait_time = RATE_LIMIT_BACKOFF_SECONDS * attempt
                print(
                    f"Advertencia: respuesta 429 (intento {attempt}/{MAX_RETRIES}): "
                    f"{exc.message}"
                )
            elif exc.code and exc.code >= 500:
                wait_time = PAUSE_SECONDS * attempt
                print(
                    f"Advertencia: error {exc.code} del modelo "
                    f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
                )
            else:
                raise GeminiError(
                    f"Error {exc.code or 'desconocido'} llamando al modelo: {exc}"
                ) from exc
            if wait_time is not None and attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc
        except genai_errors.ServerError as exc:
            wait_time = PAUSE_SECONDS * attempt
            print(
                f"Advertencia: error {exc.code} del servidor "
                f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
            )
            if attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc
        text = (response.text or "").strip()
        if text:
            return text
        # Fall back to collecting text parts from every candidate; content or
        # parts may be None on blocked or empty candidates.
        parts = []
        for candidate in response.candidates or []:
            if not candidate.content or not candidate.content.parts:
                continue
            for part in candidate.content.parts:
                value = getattr(part, "text", None)
                if value:
                    parts.append(value)
        if parts:
            return "\n".join(parts).strip()
        raise GeminiError("La respuesta del modelo no contenía texto.")
    raise GeminiError("Se agotaron los reintentos con la API de Gemini.")


def build_chunk_prompt(chapter: Chapter, idx: int, total: int) -> str:
    return (
        "Eres un historiador y estratega político. Necesito un dossier extenso, "
        "rico en matices, que pueda ocupar aproximadamente una página impresa "
        "para cada capítulo de \"El príncipe\". Mantén un tono analítico, claro y "
        "elegante en español. Sigue esta estructura en cada respuesta:\n\n"
        "#### Panorama narrativo\n"
        "- Explica en 3-4 frases qué sucede en el capítulo y por qué importa.\n\n"
        "#### Tácticas y lecciones\n"
        "- Enumera 3-4 tácticas o ideas centrales con ejemplos del texto.\n\n"
        "#### Recursos memorables\n"
        "- Cita o parafrasea dos pasajes breves y comenta su significado.\n\n"
        "#### Resonancias actuales\n"
        "- Conecta las ideas con situaciones de liderazgo contemporáneo.\n\n"
        f"Capítulo {chapter.index}/{total}: {chapter.title}\n"
        "Transcripción base:\n"
        f"\"\"\"\n{chapter.body}\n\"\"\""
    )


def slugify_anchor(text: str) -> str:
    normalized = unicodedata.normalize("NFD", text)
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_text.lower()).strip("-")
    return slug or "capitulo"

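
# Illustrative anchor (hypothetical title):
#   slugify_anchor("Capítulo 3: De los principados mixtos")
#   -> "capitulo-3-de-los-principados-mixtos"
# Accents are folded via NFD decomposition plus the ASCII re-encode above.
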

def compose_markdown(chapters: List[Chapter], summaries: List[str]) -> str:
    toc_lines: List[str] = []
    for chapter in chapters:
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        toc_lines.append(
            f"{chapter.index}. [{anchor_text}](#{anchor})"
        )
    lines = [
        '# Dossier capítulo a capítulo de "El príncipe"',
        "> \"Es mejor ser temido que amado, si no se pueden tener ambas cosas.\"",
        "---",
        "## Tabla de contenidos",
        "\n".join(toc_lines),
        "",
    ]
    for chapter, summary in zip(chapters, summaries):
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        lines.extend(
            [
                f"<a name=\"{anchor}\"></a>",
                f"### {anchor_text}",
                summary.strip(),
                "",
            ]
        )
    return "\n".join(lines).strip()


def current_text_signature() -> Dict[str, float]:
    stats = TEXT_PATH.stat()
    return {"size": stats.st_size, "mtime": stats.st_mtime}


def load_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
) -> Dict[int, str]:
    if not CHUNK_CACHE_PATH.exists():
        return {}
    try:
        data = json.loads(CHUNK_CACHE_PATH.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return {}
    if data.get("text_signature") != signature:
        return {}
    chapter_map = {chapter.index: chapter for chapter in chapters}
    cache: Dict[int, str] = {}
    for entry in data.get("chapters", []):
        idx = entry.get("index")
        summary = entry.get("summary")
        stored_hash = entry.get("hash")
        chapter = chapter_map.get(idx)
        if (
            chapter
            and isinstance(summary, str)
            and stored_hash == chapter.hash
        ):
            cache[idx] = summary
    return cache

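
# A cached summary is reused only when both the whole-file signature
# (size + mtime) and the per-chapter SHA-1 hash still match; anything else
# forces the chapter to be re-summarized.
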

def persist_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
    summaries: List[Optional[str]],
) -> None:
    entries = [
        {
            "index": chapter.index,
            "title": chapter.title,
            "hash": chapter.hash,
            "summary": summary,
        }
        for chapter, summary in zip(chapters, summaries)
        if summary is not None
    ]
    payload = {
        "text_signature": signature,
        "chapters": entries,
    }
    CHUNK_CACHE_PATH.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

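
# generate_summary() persists the cache after every chapter, so an interrupted
# run resumes from the last saved chunk instead of starting over.
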

def generate_summary() -> str:
    api_key = load_api_key()
    model = os.getenv("GEMINI_MODEL", DEFAULT_MODEL)
    client = genai.Client(api_key=api_key)
    text = TEXT_PATH.read_text(encoding="utf-8")
    chapters = split_into_chapters(text)
    if not chapters:
        raise GeminiError("No se pudieron detectar capítulos en el texto.")
    if len(chapters) != TARGET_PAGES:
        print(
            f"Advertencia: se detectaron {len(chapters)} capítulos, "
            f"pero se esperaban {TARGET_PAGES}."
        )
    signature = current_text_signature()
    cached = load_chunk_cache(signature, chapters)
    chapter_summaries: List[Optional[str]] = [None] * len(chapters)
    for pos, chapter in enumerate(chapters, start=1):
        cached_summary = cached.get(chapter.index)
        if cached_summary:
            chapter_summaries[pos - 1] = cached_summary
    total = len(chapters)
    print(f"Procesando {total} capítulos con el modelo {model}...")
    for idx, chapter in enumerate(chapters, start=1):
        if chapter_summaries[idx - 1]:
            print(f"- Capítulo {chapter.index}/{total} restaurado desde cache.")
            continue
        prompt = build_chunk_prompt(chapter, idx, total)
        summary = call_gemini(client, model, prompt, max_output_tokens=1536)
        chapter_summaries[idx - 1] = summary
        persist_chunk_cache(signature, chapters, chapter_summaries)
        print(
            f"- Capítulo {chapter.index}/{total} sintetizado "
            f"({len(chapter.body)} caracteres)."
        )
        time.sleep(PAUSE_SECONDS)
    completed_summaries = cast(
        List[str],
        [s for s in chapter_summaries if s is not None],
    )
    if len(completed_summaries) != total:
        raise GeminiError("No se completaron todos los fragmentos.")
    final_markdown = compose_markdown(chapters, completed_summaries)
    MD_OUTPUT.write_text(final_markdown.strip() + "\n", encoding="utf-8")
    return final_markdown

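
# main() shells out to pandoc, so both pandoc and a LaTeX engine (xelatex)
# must be on PATH. The Markdown file is always written first, so it survives
# even when PDF conversion fails.
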

def main() -> None:
    generate_summary()
    print(f"\nResumen Markdown guardado en: {MD_OUTPUT}")
    PDF_OUTPUT.unlink(missing_ok=True)  # remove any previous output first
    command = [
        "pandoc",
        str(MD_OUTPUT),
        "--standalone",
        "--pdf-engine=xelatex",
        "-o",
        str(PDF_OUTPUT),
    ]
    result = subprocess.run(command)
    if result.returncode != 0:
        raise RuntimeError(
            "pandoc no pudo crear el PDF. Revisa que LaTeX esté instalado."
        )
    print(f"PDF generado en: {PDF_OUTPUT}")


if __name__ == "__main__":
    main()