Initial commit: Python script for summarizing "El Principe"
Added summarize_el_principe.py script with:
- Text processing and chunking functionality
- Gemini API integration for summarization
- JSON chunk summaries storage
- PDF report generation
- Configuration management via .env file

Includes:
- Source text "el principe.txt"
- Generated summary "resumen_el_principe.md"
- PDF output "resumen_el_principe.pdf"
- Chunk summaries "chunk_summaries.json"

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
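Quick start (a minimal sketch, assuming a valid Gemini key plus pandoc and xelatex installed; the key name and .env handling match the script below, and `your-key-here` is a placeholder):

    echo 'GEMINI_API_KEY=your-key-here' > .env   # or export it in the shell
    python summarize_el_principe.py

The GEMINI_MODEL environment variable optionally overrides the default model (gemini-2.0-flash).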
summarize_el_principe.py (new file, 457 lines)
#!/usr/bin/env python3
"""
Generate a multi-step summary of `el principe.txt` using Gemini, writing both
Markdown and PDF outputs. The script splits the source text into chapter chunks
to stay within API limits, requests a partial summary for each, and assembles
them into a styled Markdown document that is then converted to PDF via pandoc.
"""

from __future__ import annotations

import hashlib
import json
import os
import re
import time
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, cast

from google import genai
from google.genai import errors as genai_errors
from google.genai import types as genai_types

TEXT_PATH = Path("el principe.txt")
MD_OUTPUT = Path("resumen_el_principe.md")
PDF_OUTPUT = Path("resumen_el_principe.pdf")
CHUNK_CACHE_PATH = Path("chunk_summaries.json")

PAUSE_SECONDS = 3.0  # small pause between successful calls
RATE_LIMIT_BACKOFF_SECONDS = 20.0  # wait time (per attempt) after 429s
MAX_RETRIES = 6
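# Note: with this linear backoff, a sustained run of 429s sleeps
# 20 + 40 + 60 + 80 + 100 seconds (300 s total) across the first five
# attempts before the sixth and final attempt is allowed to fail.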

DEFAULT_MODEL = "gemini-2.0-flash"

TARGET_PAGES = 26  # expected number of chapters


@dataclass
class Chapter:
    index: int
    title: str
    body: str
    hash: str


TIMESTAMP_RE = re.compile(r"^\s*\[\d{2}:\d{2}:\d{2}\]\s*")


def strip_timestamp(line: str) -> str:
    return TIMESTAMP_RE.sub("", line, count=1)
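# Example (illustrative): strip_timestamp("[00:12:34] Capítulo I") -> "Capítulo I"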


ROMAN_VALUES = {
    "I": 1,
    "V": 5,
    "X": 10,
    "L": 50,
    "C": 100,
    "D": 500,
    "M": 1000,
}


def roman_to_int(value: str) -> Optional[int]:
    value = value.upper()
    total = 0
    prev = 0
    for char in reversed(value):
        if char not in ROMAN_VALUES:
            return None
        current = ROMAN_VALUES[char]
        if current < prev:
            total -= current
        else:
            total += current
        prev = current
    return total
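# Examples (illustrative): roman_to_int("XIV") -> 14, roman_to_int("XXVI") -> 26,
# and roman_to_int("foo") -> None for anything outside the Roman alphabet.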


def detect_chapter_heading(line: str) -> Optional[Tuple[int, str]]:
    clean = strip_timestamp(line).strip()
    if not clean:
        return None

    match = re.match(
        r"(?i)cap[ií]tulo\s+([ivxlcdm]+|\d+)(?:\s*[\.\-:]?\s*(.*))?",
        clean,
    )
    if match:
        raw_label = match.group(1)
        rest = (match.group(2) or "").strip(" .:-")
        number = (
            int(raw_label)
            if raw_label.isdigit()
            else roman_to_int(raw_label)
        )
        if number:
            title = rest or f"Capítulo {number}"
            return number, title

    match = re.match(r"(\d{1,2})\.\s*([A-ZÁÉÍÓÚ].*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title

    match = re.match(r"(?i)(?:puesto|p)\.?\s*(\d{1,2})\.\s*(.*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title

    return None
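# Examples (illustrative):
#   detect_chapter_heading("Capítulo III. De los principados mixtos")
#       -> (3, "De los principados mixtos")
#   detect_chapter_heading("7. De los principados nuevos")
#       -> (7, "De los principados nuevos")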


def split_into_chapters(text: str) -> List[Chapter]:
    lines = text.splitlines()
    chapters: List[Chapter] = []
    current_number: Optional[int] = None
    current_title: str = ""
    current_lines: List[str] = []

    for line in lines:
        heading = detect_chapter_heading(line)
        if heading:
            number, title = heading
            if current_number is not None and current_lines:
                body = "\n".join(current_lines).strip()
                if body:
                    chapters.append(
                        Chapter(
                            index=current_number,
                            title=current_title or f"Capítulo {current_number}",
                            body=body,
                            hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
                        )
                    )
            current_number = number
            current_title = title
            current_lines = []
        elif current_number is not None:
            current_lines.append(strip_timestamp(line))

    if current_number is not None and current_lines:
        body = "\n".join(current_lines).strip()
        chapters.append(
            Chapter(
                index=current_number,
                title=current_title or f"Capítulo {current_number}",
                body=body,
                hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
            )
        )

    chapters.sort(key=lambda ch: ch.index)
    return chapters
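# Note: lines before the first detected heading are dropped (current_number is
# still None there), so any preamble in the transcript is ignored.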


class GeminiError(RuntimeError):
    """Raised when the Gemini API does not return usable text."""


def load_api_key() -> str:
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key:
        return api_key

    env_path = Path(".env")
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.strip().startswith("GEMINI_API_KEY"):
                _, _, raw_value = line.partition("=")
                value = raw_value.strip().strip("\"' ")
                if value:
                    os.environ["GEMINI_API_KEY"] = value
                    return value

    raise RuntimeError(
        "La variable de entorno GEMINI_API_KEY no está configurada. "
        "Define la clave en el entorno o en .env."
    )


def call_gemini(
    client: genai.Client,
    model: str,
    prompt: str,
    *,
    temperature: float = 0.4,
    top_p: float = 0.95,
    top_k: int = 32,
    max_output_tokens: int = 1536,
) -> str:
    """Invoke the Gemini API with retries and return the generated text."""
    config = genai_types.GenerateContentConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_output_tokens=max_output_tokens,
        response_mime_type="text/plain",
        response_modalities=["TEXT"],
    )

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.models.generate_content(
                model=model,
                contents=prompt,
                config=config,
            )
        except genai_errors.ClientError as exc:
            wait_time: Optional[float] = None
            if exc.code == 429:
                wait_time = RATE_LIMIT_BACKOFF_SECONDS * attempt
                print(
                    f"Advertencia: respuesta 429 (intento {attempt}/{MAX_RETRIES}): "
                    f"{exc.message}"
                )
            elif exc.code and exc.code >= 500:
                wait_time = PAUSE_SECONDS * attempt
                print(
                    f"Advertencia: error {exc.code} del modelo "
                    f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
                )
            else:
                raise GeminiError(
                    f"Error {exc.code or 'desconocido'} llamando al modelo: {exc}"
                ) from exc

            if wait_time is not None and attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc
        except genai_errors.ServerError as exc:
            wait_time = PAUSE_SECONDS * attempt
            print(
                f"Advertencia: error {exc.code} del servidor "
                f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
            )
            if attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc

        text = (response.text or "").strip()
        if text:
            return text

        parts = []
        for candidate in response.candidates or []:
            if candidate.content is None or not candidate.content.parts:
                continue  # skip candidates without content (e.g. safety blocks)
            for part in candidate.content.parts:
                value = getattr(part, "text", None)
                if value:
                    parts.append(value)
        if parts:
            return "\n".join(parts).strip()

        raise GeminiError("La respuesta del modelo no contenía texto.")

    raise GeminiError("Se agotaron los reintentos con la API de Gemini.")


def build_chunk_prompt(chapter: Chapter, idx: int, total: int) -> str:
    return (
        "Eres un historiador y estratega político. Necesito un dossier extenso, "
        "rico en matices, que pueda ocupar aproximadamente una página impresa "
        "para cada capítulo de \"El príncipe\". Mantén un tono analítico, claro y "
        "elegante en español. Sigue esta estructura en cada respuesta:\n\n"
        "#### Panorama narrativo\n"
        "- Explica en 3-4 frases qué sucede en el capítulo y por qué importa.\n\n"
        "#### Tácticas y lecciones\n"
        "- Enumera 3-4 tácticas o ideas centrales con ejemplos del texto.\n\n"
        "#### Recursos memorables\n"
        "- Cita o parafrasea dos pasajes breves y comenta su significado.\n\n"
        "#### Resonancias actuales\n"
        "- Conecta las ideas con situaciones de liderazgo contemporáneo.\n\n"
        f"Capítulo {chapter.index}/{total}: {chapter.title}\n"
        "Transcripción base:\n"
        f"\"\"\"\n{chapter.body}\n\"\"\""
    )
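# Note: the `idx` parameter is currently unused here; the prompt interpolates
# the chapter's own index instead, which also holds if numbering has gaps.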


def slugify_anchor(text: str) -> str:
    normalized = unicodedata.normalize("NFD", text)
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_text.lower()).strip("-")
    return slug or "capitulo"
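# Example (illustrative):
#   slugify_anchor("Capítulo 3: De los principados mixtos")
#       -> "capitulo-3-de-los-principados-mixtos"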


def compose_markdown(chapters: List[Chapter], summaries: List[str]) -> str:
    toc_lines: List[str] = []
    for chapter in chapters:
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        toc_lines.append(
            f"{chapter.index}. [{anchor_text}](#{anchor})"
        )

    lines = [
        '# Dossier capítulo a capítulo de "El príncipe"',
        "> \"Es mejor ser temido que amado, si no se pueden tener ambas cosas.\"",
        "---",
        "## Tabla de contenidos",
        "\n".join(toc_lines),
        "",
    ]

    for chapter, summary in zip(chapters, summaries):
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        lines.extend(
            [
                f"<a name=\"{anchor}\"></a>",
                f"### {anchor_text}",
                summary.strip(),
                "",
            ]
        )

    return "\n".join(lines).strip()


def current_text_signature() -> Dict[str, float]:
    stats = TEXT_PATH.stat()
    return {"size": stats.st_size, "mtime": stats.st_mtime}
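# Note: json round-trips this dict unchanged (int size, float mtime), so the
# equality check against the cached "text_signature" below is exact.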


def load_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
) -> Dict[int, str]:
    if not CHUNK_CACHE_PATH.exists():
        return {}
    try:
        data = json.loads(CHUNK_CACHE_PATH.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return {}
    if data.get("text_signature") != signature:
        return {}

    chapter_map = {chapter.index: chapter for chapter in chapters}
    cache: Dict[int, str] = {}
    for entry in data.get("chapters", []):
        idx = entry.get("index")
        summary = entry.get("summary")
        stored_hash = entry.get("hash")
        chapter = chapter_map.get(idx)
        if (
            chapter
            and isinstance(summary, str)
            and stored_hash == chapter.hash
        ):
            cache[idx] = summary
    return cache


def persist_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
    summaries: List[Optional[str]],
) -> None:
    entries = [
        {
            "index": chapter.index,
            "title": chapter.title,
            "hash": chapter.hash,
            "summary": summary,
        }
        for chapter, summary in zip(chapters, summaries)
        if summary is not None
    ]
    payload = {
        "text_signature": signature,
        "chapters": entries,
    }
    CHUNK_CACHE_PATH.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
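# Cache layout written above (values illustrative):
# {
#   "text_signature": {"size": 1048576, "mtime": 1700000000.0},
#   "chapters": [
#     {"index": 1, "title": "...", "hash": "<sha1>", "summary": "..."}
#   ]
# }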


def generate_summary() -> str:
    api_key = load_api_key()
    model = os.getenv("GEMINI_MODEL", DEFAULT_MODEL)
    client = genai.Client(api_key=api_key)
    text = TEXT_PATH.read_text(encoding="utf-8")
    chapters = split_into_chapters(text)
    if not chapters:
        raise GeminiError("No se pudieron detectar capítulos en el texto.")
    if len(chapters) != TARGET_PAGES:
        print(
            f"Advertencia: se detectaron {len(chapters)} capítulos, "
            f"pero se esperaban {TARGET_PAGES}."
        )

    signature = current_text_signature()
    cached = load_chunk_cache(signature, chapters)

    chapter_summaries: List[Optional[str]] = [None] * len(chapters)
    for pos, chapter in enumerate(chapters, start=1):
        cached_summary = cached.get(chapter.index)
        if cached_summary:
            chapter_summaries[pos - 1] = cached_summary

    total = len(chapters)
    print(f"Procesando {total} capítulos con el modelo {model}...")

    for idx, chapter in enumerate(chapters, start=1):
        if chapter_summaries[idx - 1]:
            print(f"- Capítulo {chapter.index}/{total} restaurado desde cache.")
            continue

        prompt = build_chunk_prompt(chapter, idx, total)
        summary = call_gemini(client, model, prompt, max_output_tokens=1536)
        chapter_summaries[idx - 1] = summary
        persist_chunk_cache(signature, chapters, chapter_summaries)
        print(
            f"- Capítulo {chapter.index}/{total} sintetizado "
            f"({len(chapter.body)} caracteres)."
        )
        time.sleep(PAUSE_SECONDS)

    completed_summaries = cast(
        List[str],
        [s for s in chapter_summaries if s is not None],
    )
    if len(completed_summaries) != total:
        raise GeminiError("No se completaron todos los fragmentos.")

    final_markdown = compose_markdown(chapters, completed_summaries)
    MD_OUTPUT.write_text(final_markdown.strip() + "\n", encoding="utf-8")
    return final_markdown


def main() -> None:
    final_markdown = generate_summary()
    print(f"\nResumen Markdown guardado en: {MD_OUTPUT}")

    PDF_OUTPUT.unlink(missing_ok=True)  # remove the previous file if it exists
    command = (
        f"pandoc \"{MD_OUTPUT}\" --standalone --pdf-engine=xelatex "
        f"-o \"{PDF_OUTPUT}\""
    )
    result = os.system(command)
    if result != 0:
        raise RuntimeError("pandoc no pudo crear el PDF. Revisa que LaTeX esté instalado.")

    print(f"PDF generado en: {PDF_OUTPUT}")


if __name__ == "__main__":
    main()