Initial commit: Python script for summarizing "El príncipe"

Added summarize_el_principe.py script with:
- Text processing and chunking functionality
- Gemini API integration for summarization
- JSON chunk summaries storage
- PDF report generation
- Configuration management via .env file

Includes:
- Source text "el principe.txt"
- Generated summary "resumen_el_principe.md"
- PDF output "resumen_el_principe.pdf"
- Chunk summaries "chunk_summaries.json"

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
commit 2ab4887847
Author: renato97
Date: 2025-11-10 21:31:00 +00:00
6 changed files with 3235 additions and 0 deletions

summarize_el_principe.py

@@ -0,0 +1,457 @@
#!/usr/bin/env python3
"""
Generate a chapter-by-chapter summary of `el principe.txt` using Gemini,
writing both Markdown and PDF outputs. The script splits the source text into
chapters to stay within API limits, requests one summary per chapter, and then
assembles the cached summaries into a styled Markdown document that is later
converted to PDF via pandoc.
"""
from __future__ import annotations
import hashlib
import json
import os
import re
import subprocess
import time
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, cast
from google import genai
from google.genai import errors as genai_errors
from google.genai import types as genai_types
TEXT_PATH = Path("el principe.txt")
MD_OUTPUT = Path("resumen_el_principe.md")
PDF_OUTPUT = Path("resumen_el_principe.pdf")
CHUNK_CACHE_PATH = Path("chunk_summaries.json")
PAUSE_SECONDS = 3.0 # small pause between successful calls
RATE_LIMIT_BACKOFF_SECONDS = 20.0 # wait time (per attempt) after 429s
MAX_RETRIES = 6
DEFAULT_MODEL = "gemini-2.0-flash"
TARGET_PAGES = 26  # expected number of chapters
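
# These knobs are deliberately conservative: each successful call pauses for
# PAUSE_SECONDS, 429 responses back off linearly (RATE_LIMIT_BACKOFF_SECONDS
# * attempt) for up to MAX_RETRIES attempts, and DEFAULT_MODEL can be
# overridden at runtime through the GEMINI_MODEL environment variable.
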

@dataclass
class Chapter:
    index: int
    title: str
    body: str
    hash: str

TIMESTAMP_RE = re.compile(r"^\s*\[\d{2}:\d{2}:\d{2}\]\s*")

def strip_timestamp(line: str) -> str:
    return TIMESTAMP_RE.sub("", line, count=1)

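
# Illustrative behavior of strip_timestamp (the input line is hypothetical):
#   strip_timestamp("[00:12:34] Capítulo I") -> "Capítulo I"
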

ROMAN_VALUES = {
    "I": 1,
    "V": 5,
    "X": 10,
    "L": 50,
    "C": 100,
    "D": 500,
    "M": 1000,
}


def roman_to_int(value: str) -> Optional[int]:
    value = value.upper()
    total = 0
    prev = 0
    for char in reversed(value):
        if char not in ROMAN_VALUES:
            return None
        current = ROMAN_VALUES[char]
        if current < prev:
            total -= current
        else:
            total += current
        prev = current
    return total

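
# Illustrative values: roman_to_int("XIV") -> 14, roman_to_int("XXVI") -> 26;
# any character outside ROMAN_VALUES yields None, which the heading detector
# below treats as "not a chapter number".
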

def detect_chapter_heading(line: str) -> Optional[Tuple[int, str]]:
    clean = strip_timestamp(line).strip()
    if not clean:
        return None
    match = re.match(
        r"(?i)cap[ií]tulo\s+([ivxlcdm]+|\d+)(?:\s*[\.\-:]?\s*(.*))?",
        clean,
    )
    if match:
        raw_label = match.group(1)
        rest = (match.group(2) or "").strip(" .:-")
        number = (
            int(raw_label)
            if raw_label.isdigit()
            else roman_to_int(raw_label)
        )
        if number:
            title = rest or f"Capítulo {number}"
            return number, title
    match = re.match(r"(\d{1,2})\.\s*([A-ZÁÉÍÓÚ].*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title
    match = re.match(r"(?i)(?:puesto|p)\.?\s*(\d{1,2})\.\s*(.*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title
    return None

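
# The three accepted heading shapes, shown with hypothetical example lines:
#   "Capítulo XII. De los principados nuevos"   (roman or arabic numeral)
#   "12. De los principados nuevos"             (number + capitalized title)
#   "P. 12. De los principados nuevos"          ("puesto"/"p." prefix variant)
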

def split_into_chapters(text: str) -> List[Chapter]:
    lines = text.splitlines()
    chapters: List[Chapter] = []
    current_number: Optional[int] = None
    current_title: str = ""
    current_lines: List[str] = []
    for line in lines:
        heading = detect_chapter_heading(line)
        if heading:
            number, title = heading
            if current_number is not None and current_lines:
                body = "\n".join(current_lines).strip()
                if body:
                    chapters.append(
                        Chapter(
                            index=current_number,
                            title=current_title or f"Capítulo {current_number}",
                            body=body,
                            hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
                        )
                    )
            current_number = number
            current_title = title
            current_lines = []
        elif current_number is not None:
            current_lines.append(strip_timestamp(line))
    if current_number is not None and current_lines:
        body = "\n".join(current_lines).strip()
        chapters.append(
            Chapter(
                index=current_number,
                title=current_title or f"Capítulo {current_number}",
                body=body,
                hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
            )
        )
    chapters.sort(key=lambda ch: ch.index)
    return chapters

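
# Note: any preamble before the first detected heading is discarded, and the
# final sort by chapter number tolerates out-of-order headings in the source.
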

class GeminiError(RuntimeError):
    """Raised when the Gemini API does not return usable text."""


def load_api_key() -> str:
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key:
        return api_key
    env_path = Path(".env")
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.strip().startswith("GEMINI_API_KEY"):
                _, _, raw_value = line.partition("=")
                value = raw_value.strip().strip("\"' ")
                if value:
                    os.environ["GEMINI_API_KEY"] = value
                    return value
    raise RuntimeError(
        "La variable de entorno GEMINI_API_KEY no está configurada. "
        "Define la clave en el entorno o en .env."
    )

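
# Expected .env layout (the value below is a placeholder, not a real key):
#   GEMINI_API_KEY=your-api-key-here
# Surrounding quotes are tolerated; they are stripped when the file is parsed.
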

def call_gemini(
    client: genai.Client,
    model: str,
    prompt: str,
    *,
    temperature: float = 0.4,
    top_p: float = 0.95,
    top_k: int = 32,
    max_output_tokens: int = 1536,
) -> str:
    """Invoke the Gemini API with retries and return the generated text."""
    config = genai_types.GenerateContentConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_output_tokens=max_output_tokens,
        response_mime_type="text/plain",
        response_modalities=["TEXT"],
    )
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.models.generate_content(
                model=model,
                contents=prompt,
                config=config,
            )
        except genai_errors.ClientError as exc:
            wait_time: Optional[float] = None
            if exc.code == 429:
                wait_time = RATE_LIMIT_BACKOFF_SECONDS * attempt
                print(
                    f"Advertencia: respuesta 429 (intento {attempt}/{MAX_RETRIES}): "
                    f"{exc.message}"
                )
            elif exc.code and exc.code >= 500:
                wait_time = PAUSE_SECONDS * attempt
                print(
                    f"Advertencia: error {exc.code} del modelo "
                    f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
                )
            else:
                raise GeminiError(
                    f"Error {exc.code or 'desconocido'} llamando al modelo: {exc}"
                ) from exc
            if wait_time is not None and attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc
        except genai_errors.ServerError as exc:
            wait_time = PAUSE_SECONDS * attempt
            print(
                f"Advertencia: error {exc.code} del servidor "
                f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
            )
            if attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc
        text = (response.text or "").strip()
        if text:
            return text
        # Fall back to collecting text parts from every candidate; content or
        # parts may be None on blocked or empty candidates.
        parts = []
        for candidate in response.candidates or []:
            if not candidate.content or not candidate.content.parts:
                continue
            for part in candidate.content.parts:
                value = getattr(part, "text", None)
                if value:
                    parts.append(value)
        if parts:
            return "\n".join(parts).strip()
        raise GeminiError("La respuesta del modelo no contenía texto.")
    raise GeminiError("Se agotaron los reintentos con la API de Gemini.")


def build_chunk_prompt(chapter: Chapter, idx: int, total: int) -> str:
    return (
        "Eres un historiador y estratega político. Necesito un dossier extenso, "
        "rico en matices, que pueda ocupar aproximadamente una página impresa "
        "para cada capítulo de \"El príncipe\". Mantén un tono analítico, claro y "
        "elegante en español. Sigue esta estructura en cada respuesta:\n\n"
        "#### Panorama narrativo\n"
        "- Explica en 3-4 frases qué sucede en el capítulo y por qué importa.\n\n"
        "#### Tácticas y lecciones\n"
        "- Enumera 3-4 tácticas o ideas centrales con ejemplos del texto.\n\n"
        "#### Recursos memorables\n"
        "- Cita o parafrasea dos pasajes breves y comenta su significado.\n\n"
        "#### Resonancias actuales\n"
        "- Conecta las ideas con situaciones de liderazgo contemporáneo.\n\n"
        f"Capítulo {chapter.index}/{total}: {chapter.title}\n"
        "Transcripción base:\n"
        f"\"\"\"\n{chapter.body}\n\"\"\""
    )


def slugify_anchor(text: str) -> str:
    normalized = unicodedata.normalize("NFD", text)
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_text.lower()).strip("-")
    return slug or "capitulo"

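
# Illustrative anchor (hypothetical title):
#   slugify_anchor("Capítulo 3: De los principados mixtos")
#   -> "capitulo-3-de-los-principados-mixtos"
# Accents are folded via NFD decomposition plus the ASCII re-encode above.
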

def compose_markdown(chapters: List[Chapter], summaries: List[str]) -> str:
    toc_lines: List[str] = []
    for chapter in chapters:
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        toc_lines.append(
            f"{chapter.index}. [{anchor_text}](#{anchor})"
        )
    lines = [
        '# Dossier capítulo a capítulo de "El príncipe"',
        "> \"Es mejor ser temido que amado, si no se pueden tener ambas cosas.\"",
        "---",
        "## Tabla de contenidos",
        "\n".join(toc_lines),
        "",
    ]
    for chapter, summary in zip(chapters, summaries):
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        lines.extend(
            [
                f"<a name=\"{anchor}\"></a>",
                f"### {anchor_text}",
                summary.strip(),
                "",
            ]
        )
    return "\n".join(lines).strip()


def current_text_signature() -> Dict[str, float]:
    stats = TEXT_PATH.stat()
    return {"size": stats.st_size, "mtime": stats.st_mtime}


def load_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
) -> Dict[int, str]:
    if not CHUNK_CACHE_PATH.exists():
        return {}
    try:
        data = json.loads(CHUNK_CACHE_PATH.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return {}
    if data.get("text_signature") != signature:
        return {}
    chapter_map = {chapter.index: chapter for chapter in chapters}
    cache: Dict[int, str] = {}
    for entry in data.get("chapters", []):
        idx = entry.get("index")
        summary = entry.get("summary")
        stored_hash = entry.get("hash")
        chapter = chapter_map.get(idx)
        if (
            chapter
            and isinstance(summary, str)
            and stored_hash == chapter.hash
        ):
            cache[idx] = summary
    return cache

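
# A cached summary is reused only when both the whole-file signature
# (size + mtime) and the per-chapter SHA-1 hash still match; anything else
# forces the chapter to be re-summarized.
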

def persist_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
    summaries: List[Optional[str]],
) -> None:
    entries = [
        {
            "index": chapter.index,
            "title": chapter.title,
            "hash": chapter.hash,
            "summary": summary,
        }
        for chapter, summary in zip(chapters, summaries)
        if summary is not None
    ]
    payload = {
        "text_signature": signature,
        "chapters": entries,
    }
    CHUNK_CACHE_PATH.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

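
# generate_summary() persists the cache after every chapter, so an interrupted
# run resumes from the last saved chunk instead of starting over.
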

def generate_summary() -> str:
    api_key = load_api_key()
    model = os.getenv("GEMINI_MODEL", DEFAULT_MODEL)
    client = genai.Client(api_key=api_key)
    text = TEXT_PATH.read_text(encoding="utf-8")
    chapters = split_into_chapters(text)
    if not chapters:
        raise GeminiError("No se pudieron detectar capítulos en el texto.")
    if len(chapters) != TARGET_PAGES:
        print(
            f"Advertencia: se detectaron {len(chapters)} capítulos, "
            f"pero se esperaban {TARGET_PAGES}."
        )
    signature = current_text_signature()
    cached = load_chunk_cache(signature, chapters)
    chapter_summaries: List[Optional[str]] = [None] * len(chapters)
    for pos, chapter in enumerate(chapters, start=1):
        cached_summary = cached.get(chapter.index)
        if cached_summary:
            chapter_summaries[pos - 1] = cached_summary
    total = len(chapters)
    print(f"Procesando {total} capítulos con el modelo {model}...")
    for idx, chapter in enumerate(chapters, start=1):
        if chapter_summaries[idx - 1]:
            print(f"- Capítulo {chapter.index}/{total} restaurado desde cache.")
            continue
        prompt = build_chunk_prompt(chapter, idx, total)
        summary = call_gemini(client, model, prompt, max_output_tokens=1536)
        chapter_summaries[idx - 1] = summary
        persist_chunk_cache(signature, chapters, chapter_summaries)
        print(
            f"- Capítulo {chapter.index}/{total} sintetizado "
            f"({len(chapter.body)} caracteres)."
        )
        time.sleep(PAUSE_SECONDS)
    completed_summaries = cast(
        List[str],
        [s for s in chapter_summaries if s is not None],
    )
    if len(completed_summaries) != total:
        raise GeminiError("No se completaron todos los fragmentos.")
    final_markdown = compose_markdown(chapters, completed_summaries)
    MD_OUTPUT.write_text(final_markdown.strip() + "\n", encoding="utf-8")
    return final_markdown

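
# main() shells out to pandoc, so both pandoc and a LaTeX engine (xelatex)
# must be on PATH. The Markdown file is always written first, so it survives
# even when PDF conversion fails.
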

def main() -> None:
    generate_summary()
    print(f"\nResumen Markdown guardado en: {MD_OUTPUT}")
    PDF_OUTPUT.unlink(missing_ok=True)  # remove any previous output first
    command = [
        "pandoc",
        str(MD_OUTPUT),
        "--standalone",
        "--pdf-engine=xelatex",
        "-o",
        str(PDF_OUTPUT),
    ]
    result = subprocess.run(command)
    if result.returncode != 0:
        raise RuntimeError(
            "pandoc no pudo crear el PDF. Revisa que LaTeX esté instalado."
        )
    print(f"PDF generado en: {PDF_OUTPUT}")


if __name__ == "__main__":
    main()