Added summarize_el_principe.py script with:

- Text processing and chapter-based chunking
- Gemini API integration for summarization
- JSON chunk-summary cache storage
- PDF report generation via pandoc
- Configuration management via .env file

Includes:

- Source text "el principe.txt"
- Generated summary "resumen_el_principe.md"
- PDF output "resumen_el_principe.pdf"
- Chunk summaries "chunk_summaries.json"

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
458 lines
14 KiB
Python
#!/usr/bin/env python3
"""
Generate a multi-step summary of `el principe.txt` using Gemini, writing both
Markdown and PDF outputs. The script splits the source text into chapters to
stay within API limits, requests a partial summary for each chapter, and then
assembles the results into a styled Markdown document that is converted to
PDF with pandoc.
"""
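
# Usage (assumes GEMINI_API_KEY is set in the environment or in .env):
#   python summarize_el_principe.py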

from __future__ import annotations

import hashlib
import json
import os
import re
import subprocess
import time
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, cast

from google import genai
from google.genai import errors as genai_errors
from google.genai import types as genai_types

TEXT_PATH = Path("el principe.txt")
MD_OUTPUT = Path("resumen_el_principe.md")
PDF_OUTPUT = Path("resumen_el_principe.pdf")
CHUNK_CACHE_PATH = Path("chunk_summaries.json")

PAUSE_SECONDS = 3.0  # small pause between successful calls
RATE_LIMIT_BACKOFF_SECONDS = 20.0  # wait time (per attempt) after 429s
MAX_RETRIES = 6

DEFAULT_MODEL = "gemini-2.0-flash"

TARGET_PAGES = 26  # expected number of chapters
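

# One parsed chapter of the source text; `hash` fingerprints the body so the
# chunk cache can tell when a chapter's content has changed.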
@dataclass
class Chapter:
    index: int
    title: str
    body: str
    hash: str


TIMESTAMP_RE = re.compile(r"^\s*\[\d{2}:\d{2}:\d{2}\]\s*")


def strip_timestamp(line: str) -> str:
    return TIMESTAMP_RE.sub("", line, count=1)


ROMAN_VALUES = {
    "I": 1,
    "V": 5,
    "X": 10,
    "L": 50,
    "C": 100,
    "D": 500,
    "M": 1000,
}
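

# Standard subtractive Roman-numeral parsing: scan right to left, subtracting
# a value when it is smaller than the value to its right, e.g.
# roman_to_int("XXVI") == 26 and roman_to_int("IX") == 9; returns None for
# characters outside ROMAN_VALUES.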
def roman_to_int(value: str) -> Optional[int]:
    value = value.upper()
    total = 0
    prev = 0
    for char in reversed(value):
        if char not in ROMAN_VALUES:
            return None
        current = ROMAN_VALUES[char]
        if current < prev:
            total -= current
        else:
            total += current
        prev = current
    return total
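

# A heading can appear as "Capítulo IV ..." / "Capítulo 4: ..." (Roman or
# Arabic numerals), as "4. Título", or as "Puesto 4. ..." / "P. 4. ...";
# returns (number, title) for a match and None otherwise.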
def detect_chapter_heading(line: str) -> Optional[Tuple[int, str]]:
    clean = strip_timestamp(line).strip()
    if not clean:
        return None

    match = re.match(
        r"(?i)cap[ií]tulo\s+([ivxlcdm]+|\d+)(?:\s*[\.\-:]?\s*(.*))?",
        clean,
    )
    if match:
        raw_label = match.group(1)
        rest = (match.group(2) or "").strip(" .:-")
        number = (
            int(raw_label)
            if raw_label.isdigit()
            else roman_to_int(raw_label)
        )
        if number:
            title = rest or f"Capítulo {number}"
            return number, title

    match = re.match(r"(\d{1,2})\.\s*([A-ZÁÉÍÓÚ].*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title

    match = re.match(r"(?i)(?:puesto|p)\.?\s*(\d{1,2})\.\s*(.*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title

    return None
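

# Single pass over the transcript: each detected heading closes the previous
# chapter (hashing its body for the cache) and opens a new one; lines before
# the first heading are ignored, and the result is sorted by chapter index.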
def split_into_chapters(text: str) -> List[Chapter]:
    lines = text.splitlines()
    chapters: List[Chapter] = []
    current_number: Optional[int] = None
    current_title: str = ""
    current_lines: List[str] = []

    for line in lines:
        heading = detect_chapter_heading(line)
        if heading:
            number, title = heading
            if current_number is not None and current_lines:
                body = "\n".join(current_lines).strip()
                if body:
                    chapters.append(
                        Chapter(
                            index=current_number,
                            title=current_title or f"Capítulo {current_number}",
                            body=body,
                            hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
                        )
                    )
            current_number = number
            current_title = title
            current_lines = []
        elif current_number is not None:
            current_lines.append(strip_timestamp(line))

    if current_number is not None and current_lines:
        body = "\n".join(current_lines).strip()
        chapters.append(
            Chapter(
                index=current_number,
                title=current_title or f"Capítulo {current_number}",
                body=body,
                hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
            )
        )

    chapters.sort(key=lambda ch: ch.index)
    return chapters


class GeminiError(RuntimeError):
    """Raised when the Gemini API does not return usable text."""
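

# Resolution order: the GEMINI_API_KEY environment variable first, then a
# KEY=value line in .env (quotes stripped); a value found in .env is exported
# to the environment so later lookups see it.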
def load_api_key() -> str:
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key:
        return api_key

    env_path = Path(".env")
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.strip().startswith("GEMINI_API_KEY"):
                _, _, raw_value = line.partition("=")
                value = raw_value.strip().strip("\"' ")
                if value:
                    os.environ["GEMINI_API_KEY"] = value
                    return value

    raise RuntimeError(
        "La variable de entorno GEMINI_API_KEY no está configurada. "
        "Define la clave en el entorno o en .env."
    )
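

# Retries use linear backoff: 429 responses wait RATE_LIMIT_BACKOFF_SECONDS *
# attempt, 5xx errors wait PAUSE_SECONDS * attempt, and any other client error
# aborts immediately.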
def call_gemini(
    client: genai.Client,
    model: str,
    prompt: str,
    *,
    temperature: float = 0.4,
    top_p: float = 0.95,
    top_k: int = 32,
    max_output_tokens: int = 1536,
) -> str:
    """Invoke the Gemini API with retries and return the generated text."""
    config = genai_types.GenerateContentConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_output_tokens=max_output_tokens,
        response_mime_type="text/plain",
        response_modalities=["TEXT"],
    )

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.models.generate_content(
                model=model,
                contents=prompt,
                config=config,
            )
        except genai_errors.ClientError as exc:
            wait_time: Optional[float] = None
            if exc.code == 429:
                wait_time = RATE_LIMIT_BACKOFF_SECONDS * attempt
                print(
                    f"Advertencia: respuesta 429 (intento {attempt}/{MAX_RETRIES}): "
                    f"{exc.message}"
                )
            elif exc.code and exc.code >= 500:
                wait_time = PAUSE_SECONDS * attempt
                print(
                    f"Advertencia: error {exc.code} del modelo "
                    f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
                )
            else:
                raise GeminiError(
                    f"Error {exc.code or 'desconocido'} llamando al modelo: {exc}"
                ) from exc

            if wait_time is not None and attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc
        except genai_errors.ServerError as exc:
            wait_time = PAUSE_SECONDS * attempt
            print(
                f"Advertencia: error {exc.code} del servidor "
                f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
            )
            if attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc

        text = (response.text or "").strip()
        if text:
            return text

        # Fall back to concatenating candidate parts when `response.text` is
        # empty; skip candidates whose content was filtered out entirely.
        parts = []
        for candidate in response.candidates or []:
            if candidate.content is None:
                continue
            for part in candidate.content.parts or []:
                value = getattr(part, "text", None)
                if value:
                    parts.append(value)
        if parts:
            return "\n".join(parts).strip()

        raise GeminiError("La respuesta del modelo no contenía texto.")

    raise GeminiError("Se agotaron los reintentos con la API de Gemini.")


def build_chunk_prompt(chapter: Chapter, total: int) -> str:
    return (
        "Eres un historiador y estratega político. Necesito un dossier extenso, "
        "rico en matices, que pueda ocupar aproximadamente una página impresa "
        "para cada capítulo de \"El príncipe\". Mantén un tono analítico, claro y "
        "elegante en español. Sigue esta estructura en cada respuesta:\n\n"
        "#### Panorama narrativo\n"
        "- Explica en 3-4 frases qué sucede en el capítulo y por qué importa.\n\n"
        "#### Tácticas y lecciones\n"
        "- Enumera 3-4 tácticas o ideas centrales con ejemplos del texto.\n\n"
        "#### Recursos memorables\n"
        "- Cita o parafrasea dos pasajes breves y comenta su significado.\n\n"
        "#### Resonancias actuales\n"
        "- Conecta las ideas con situaciones de liderazgo contemporáneo.\n\n"
        f"Capítulo {chapter.index}/{total}: {chapter.title}\n"
        "Transcripción base:\n"
        f"\"\"\"\n{chapter.body}\n\"\"\""
    )
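

# ASCII anchor slugs for the table of contents, e.g.
# slugify_anchor("Capítulo 3: De los principados mixtos")
# == "capitulo-3-de-los-principados-mixtos".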
def slugify_anchor(text: str) -> str:
    normalized = unicodedata.normalize("NFD", text)
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_text.lower()).strip("-")
    return slug or "capitulo"
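

# Assembles the final document: title, epigraph, a linked table of contents,
# and one section per chapter, each preceded by an explicit <a name=...>
# anchor that the TOC links target.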
def compose_markdown(chapters: List[Chapter], summaries: List[str]) -> str:
    toc_lines: List[str] = []
    for chapter in chapters:
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        toc_lines.append(
            f"{chapter.index}. [{anchor_text}](#{anchor})"
        )

    lines = [
        '# Dossier capítulo a capítulo de "El príncipe"',
        "> \"Es mejor ser temido que amado, si no se pueden tener ambas cosas.\"",
        "---",
        "## Tabla de contenidos",
        "\n".join(toc_lines),
        "",
    ]

    for chapter, summary in zip(chapters, summaries):
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        lines.extend(
            [
                f"<a name=\"{anchor}\"></a>",
                f"### {anchor_text}",
                summary.strip(),
                "",
            ]
        )

    return "\n".join(lines).strip()
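

# Cache invalidation is two-level: the whole cache is discarded when the
# source file's signature (size + mtime) changes, and individual entries are
# discarded when a chapter's body hash no longer matches.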
def current_text_signature() -> Dict[str, float]:
    stats = TEXT_PATH.stat()
    return {"size": stats.st_size, "mtime": stats.st_mtime}


def load_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
) -> Dict[int, str]:
    if not CHUNK_CACHE_PATH.exists():
        return {}
    try:
        data = json.loads(CHUNK_CACHE_PATH.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return {}
    if data.get("text_signature") != signature:
        return {}

    chapter_map = {chapter.index: chapter for chapter in chapters}
    cache: Dict[int, str] = {}
    for entry in data.get("chapters", []):
        idx = entry.get("index")
        summary = entry.get("summary")
        stored_hash = entry.get("hash")
        chapter = chapter_map.get(idx)
        if (
            chapter
            and isinstance(summary, str)
            and stored_hash == chapter.hash
        ):
            cache[idx] = summary
    return cache


def persist_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
    summaries: List[Optional[str]],
) -> None:
    entries = [
        {
            "index": chapter.index,
            "title": chapter.title,
            "hash": chapter.hash,
            "summary": summary,
        }
        for chapter, summary in zip(chapters, summaries)
        if summary is not None
    ]
    payload = {
        "text_signature": signature,
        "chapters": entries,
    }
    CHUNK_CACHE_PATH.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
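

# Pipeline driver: detect chapters, restore cached summaries, ask Gemini for
# the missing ones (persisting the cache after every chapter so interrupted
# runs can resume), then write the assembled Markdown.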
def generate_summary() -> str:
    api_key = load_api_key()
    model = os.getenv("GEMINI_MODEL", DEFAULT_MODEL)
    client = genai.Client(api_key=api_key)
    text = TEXT_PATH.read_text(encoding="utf-8")
    chapters = split_into_chapters(text)
    if not chapters:
        raise GeminiError("No se pudieron detectar capítulos en el texto.")
    if len(chapters) != TARGET_PAGES:
        print(
            f"Advertencia: se detectaron {len(chapters)} capítulos, "
            f"pero se esperaban {TARGET_PAGES}."
        )

    signature = current_text_signature()
    cached = load_chunk_cache(signature, chapters)

    chapter_summaries: List[Optional[str]] = [None] * len(chapters)
    for pos, chapter in enumerate(chapters):
        cached_summary = cached.get(chapter.index)
        if cached_summary:
            chapter_summaries[pos] = cached_summary

    total = len(chapters)
    print(f"Procesando {total} capítulos con el modelo {model}...")

    for idx, chapter in enumerate(chapters, start=1):
        if chapter_summaries[idx - 1]:
            print(f"- Capítulo {chapter.index}/{total} restaurado desde cache.")
            continue

        prompt = build_chunk_prompt(chapter, total)
        summary = call_gemini(client, model, prompt, max_output_tokens=1536)
        chapter_summaries[idx - 1] = summary
        persist_chunk_cache(signature, chapters, chapter_summaries)
        print(
            f"- Capítulo {chapter.index}/{total} sintetizado "
            f"({len(chapter.body)} caracteres)."
        )
        time.sleep(PAUSE_SECONDS)

    completed_summaries = cast(
        List[str],
        [s for s in chapter_summaries if s is not None],
    )
    if len(completed_summaries) != total:
        raise GeminiError("No se completaron todos los fragmentos.")

    final_markdown = compose_markdown(chapters, completed_summaries)
    MD_OUTPUT.write_text(final_markdown.strip() + "\n", encoding="utf-8")
    return final_markdown
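

# PDF generation shells out to pandoc, which must be on PATH along with a
# LaTeX engine (xelatex).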
def main() -> None:
    generate_summary()
    print(f"\nResumen Markdown guardado en: {MD_OUTPUT}")

    PDF_OUTPUT.unlink(missing_ok=True)  # remove the previous PDF if it exists
    result = subprocess.run(
        [
            "pandoc",
            str(MD_OUTPUT),
            "--standalone",
            "--pdf-engine=xelatex",
            "-o",
            str(PDF_OUTPUT),
        ]
    )
    if result.returncode != 0:
        raise RuntimeError(
            "pandoc no pudo crear el PDF. Revisa que LaTeX esté instalado."
        )

    print(f"PDF generado en: {PDF_OUTPUT}")


if __name__ == "__main__":
    main()