#!/usr/bin/env python3
"""Generate a chapter-by-chapter summary of `el principe.txt` with Gemini,
writing both Markdown and PDF outputs.

The script splits the source text into chapters, requests a summary of each
chapter from the model (caching partial results on disk so interrupted runs
can resume), stitches the responses into a styled Markdown dossier, and
finally converts that document to PDF with pandoc.
"""

from __future__ import annotations

import hashlib
import json
import os
import re
import time
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, cast

from google import genai
from google.genai import errors as genai_errors
from google.genai import types as genai_types

TEXT_PATH = Path("el principe.txt")
MD_OUTPUT = Path("resumen_el_principe.md")
PDF_OUTPUT = Path("resumen_el_principe.pdf")
CHUNK_CACHE_PATH = Path("chunk_summaries.json")
PAUSE_SECONDS = 3.0  # small pause between successful calls
RATE_LIMIT_BACKOFF_SECONDS = 20.0  # wait time (per attempt) after 429s
MAX_RETRIES = 6
DEFAULT_MODEL = "gemini-2.0-flash"
TARGET_PAGES = 26  # expected number of chapters in "El príncipe"


@dataclass
class Chapter:
    index: int
    title: str
    body: str
    hash: str


# Transcript lines may carry a leading [HH:MM:SS] timestamp.
TIMESTAMP_RE = re.compile(r"^\s*\[\d{2}:\d{2}:\d{2}\]\s*")


def strip_timestamp(line: str) -> str:
    return TIMESTAMP_RE.sub("", line, count=1)


ROMAN_VALUES = {
    "I": 1,
    "V": 5,
    "X": 10,
    "L": 50,
    "C": 100,
    "D": 500,
    "M": 1000,
}


def roman_to_int(value: str) -> Optional[int]:
    """Convert a Roman numeral to an int, or return None on invalid input."""
    value = value.upper()
    total = 0
    prev = 0
    for char in reversed(value):
        if char not in ROMAN_VALUES:
            return None
        current = ROMAN_VALUES[char]
        if current < prev:
            total -= current
        else:
            total += current
        prev = current
    return total


def detect_chapter_heading(line: str) -> Optional[Tuple[int, str]]:
    """Return (chapter number, title) if the line looks like a heading."""
    clean = strip_timestamp(line).strip()
    if not clean:
        return None
    # "Capítulo XIV. Título" or "Capítulo 14: Título" (Roman or Arabic).
    match = re.match(
        r"(?i)cap[ií]tulo\s+([ivxlcdm]+|\d+)(?:\s*[\.\-:]?\s*(.*))?",
        clean,
    )
    if match:
        raw_label = match.group(1)
        rest = (match.group(2) or "").strip(" .:-")
        number = (
            int(raw_label) if raw_label.isdigit() else roman_to_int(raw_label)
        )
        if number:
            title = rest or f"Capítulo {number}"
            return number, title
    # "14. Título" with a capitalised title.
    match = re.match(r"(\d{1,2})\.\s*([A-ZÁÉÍÓÚ].*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title
    # "Puesto 14. Título" / "P. 14. Título".
    match = re.match(r"(?i)(?:puesto|p)\.?\s*(\d{1,2})\.\s*(.*)", clean)
    if match:
        number = int(match.group(1))
        title = match.group(2).strip(" .:-") or f"Capítulo {number}"
        return number, title
    return None


def split_into_chapters(text: str) -> List[Chapter]:
    lines = text.splitlines()
    chapters: List[Chapter] = []
    current_number: Optional[int] = None
    current_title: str = ""
    current_lines: List[str] = []
    for line in lines:
        heading = detect_chapter_heading(line)
        if heading:
            number, title = heading
            if current_number is not None and current_lines:
                body = "\n".join(current_lines).strip()
                if body:
                    chapters.append(
                        Chapter(
                            index=current_number,
                            title=current_title or f"Capítulo {current_number}",
                            body=body,
                            hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
                        )
                    )
            current_number = number
            current_title = title
            current_lines = []
        elif current_number is not None:
            current_lines.append(strip_timestamp(line))
    if current_number is not None and current_lines:
        body = "\n".join(current_lines).strip()
        chapters.append(
            Chapter(
                index=current_number,
                title=current_title or f"Capítulo {current_number}",
                body=body,
                hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
            )
        )
    chapters.sort(key=lambda ch: ch.index)
    return chapters


class GeminiError(RuntimeError):
    """Raised when the Gemini API does not return usable text."""
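
# A couple of hand-checked examples of what the heading parser accepts,
# shown as doctest-style comments rather than executable tests:
#
#   >>> roman_to_int("XIV")
#   14
#   >>> detect_chapter_heading("[00:12:03] Capítulo XIV. De la milicia propia")
#   (14, 'De la milicia propia')
#
# Timestamps are stripped before matching, so transcript lines and plain
# text lines are treated identically.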
def load_api_key() -> str:
    """Return the Gemini API key from the environment or a local .env file."""
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key:
        return api_key
    env_path = Path(".env")
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.strip().startswith("GEMINI_API_KEY"):
                _, _, raw_value = line.partition("=")
                value = raw_value.strip().strip("\"' ")
                if value:
                    os.environ["GEMINI_API_KEY"] = value
                    return value
    raise RuntimeError(
        "La variable de entorno GEMINI_API_KEY no está configurada. "
        "Define la clave en el entorno o en .env."
    )


def call_gemini(
    client: genai.Client,
    model: str,
    prompt: str,
    *,
    temperature: float = 0.4,
    top_p: float = 0.95,
    top_k: int = 32,
    max_output_tokens: int = 1536,
) -> str:
    """Invoke the Gemini API with retries and return the generated text."""
    config = genai_types.GenerateContentConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_output_tokens=max_output_tokens,
        response_mime_type="text/plain",
        response_modalities=["TEXT"],
    )
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.models.generate_content(
                model=model,
                contents=prompt,
                config=config,
            )
        except genai_errors.ClientError as exc:
            wait_time: Optional[float] = None
            if exc.code == 429:
                # Rate limited: back off progressively longer per attempt.
                wait_time = RATE_LIMIT_BACKOFF_SECONDS * attempt
                print(
                    f"Advertencia: respuesta 429 (intento {attempt}/{MAX_RETRIES}): "
                    f"{exc.message}"
                )
            elif exc.code and exc.code >= 500:
                wait_time = PAUSE_SECONDS * attempt
                print(
                    f"Advertencia: error {exc.code} del modelo "
                    f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
                )
            else:
                raise GeminiError(
                    f"Error {exc.code or 'desconocido'} llamando al modelo: {exc}"
                ) from exc
            if wait_time is not None and attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc
        except genai_errors.ServerError as exc:
            wait_time = PAUSE_SECONDS * attempt
            print(
                f"Advertencia: error {exc.code} del servidor "
                f"(intento {attempt}/{MAX_RETRIES}): {exc.message}"
            )
            if attempt < MAX_RETRIES:
                time.sleep(wait_time)
                continue
            raise GeminiError(
                f"Falló la llamada al modelo tras reintentos: {exc}"
            ) from exc
        text = (response.text or "").strip()
        if text:
            return text
        # Fall back to collecting text parts from each candidate; guard
        # against candidates that carry no content at all.
        parts: List[str] = []
        for candidate in response.candidates or []:
            if candidate.content is None or not candidate.content.parts:
                continue
            for part in candidate.content.parts:
                value = getattr(part, "text", None)
                if value:
                    parts.append(value)
        if parts:
            return "\n".join(parts).strip()
        raise GeminiError("La respuesta del modelo no contenía texto.")
    raise GeminiError("Se agotaron los reintentos con la API de Gemini.")
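
# A minimal sketch of calling call_gemini on its own (assumes GEMINI_API_KEY
# is configured; the prompt string is illustrative):
#
#   client = genai.Client(api_key=load_api_key())
#   text = call_gemini(client, DEFAULT_MODEL, "Resume este párrafo: ...")
#
# With the defaults above, a persistent 429 sleeps 20, 40, 60, 80 and 100
# seconds across attempts 1-5 (attempt 6 raises GeminiError), i.e. up to
# five minutes of backoff before giving up.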
def build_chunk_prompt(chapter: Chapter, idx: int, total: int) -> str:
    return (
        "Eres un historiador y estratega político. Necesito un dossier extenso, "
        "rico en matices, que pueda ocupar aproximadamente una página impresa "
        "para cada capítulo de \"El príncipe\". Mantén un tono analítico, claro y "
        "elegante en español. Sigue esta estructura en cada respuesta:\n\n"
        "#### Panorama narrativo\n"
        "- Explica en 3-4 frases qué sucede en el capítulo y por qué importa.\n\n"
        "#### Tácticas y lecciones\n"
        "- Enumera 3-4 tácticas o ideas centrales con ejemplos del texto.\n\n"
        "#### Recursos memorables\n"
        "- Cita o parafrasea dos pasajes breves y comenta su significado.\n\n"
        "#### Resonancias actuales\n"
        "- Conecta las ideas con situaciones de liderazgo contemporáneo.\n\n"
        f"Capítulo {chapter.index}/{total}: {chapter.title}\n"
        "Transcripción base:\n"
        f"\"\"\"\n{chapter.body}\n\"\"\""
    )


def slugify_anchor(text: str) -> str:
    """Build a GitHub-style anchor slug from a heading."""
    normalized = unicodedata.normalize("NFD", text)
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_text.lower()).strip("-")
    return slug or "capitulo"


def compose_markdown(chapters: List[Chapter], summaries: List[str]) -> str:
    toc_lines: List[str] = []
    for chapter in chapters:
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        toc_lines.append(f"{chapter.index}. [{anchor_text}](#{anchor})")
    lines = [
        '# Dossier capítulo a capítulo de "El príncipe"',
        '> "Es mejor ser temido que amado, si no se pueden tener ambas cosas."',
        "---",
        "## Tabla de contenidos",
        "\n".join(toc_lines),
        "",
    ]
    for chapter, summary in zip(chapters, summaries):
        anchor_text = f"Capítulo {chapter.index}: {chapter.title}"
        anchor = slugify_anchor(anchor_text)
        lines.extend(
            [
                # Explicit anchor so the TOC links resolve after conversion.
                f'<a id="{anchor}"></a>',
                f"### {anchor_text}",
                summary.strip(),
                "",
            ]
        )
    return "\n".join(lines).strip()


def current_text_signature() -> Dict[str, float]:
    """Fingerprint the source file so stale caches can be detected."""
    stats = TEXT_PATH.stat()
    return {"size": stats.st_size, "mtime": stats.st_mtime}


def load_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
) -> Dict[int, str]:
    if not CHUNK_CACHE_PATH.exists():
        return {}
    try:
        data = json.loads(CHUNK_CACHE_PATH.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return {}
    if data.get("text_signature") != signature:
        return {}
    chapter_map = {chapter.index: chapter for chapter in chapters}
    cache: Dict[int, str] = {}
    for entry in data.get("chapters", []):
        idx = entry.get("index")
        summary = entry.get("summary")
        stored_hash = entry.get("hash")
        chapter = chapter_map.get(idx)
        if (
            chapter
            and isinstance(summary, str)
            and stored_hash == chapter.hash
        ):
            cache[idx] = summary
    return cache


def persist_chunk_cache(
    signature: Dict[str, float],
    chapters: List[Chapter],
    summaries: List[Optional[str]],
) -> None:
    entries = [
        {
            "index": chapter.index,
            "title": chapter.title,
            "hash": chapter.hash,
            "summary": summary,
        }
        for chapter, summary in zip(chapters, summaries)
        if summary is not None
    ]
    payload = {
        "text_signature": signature,
        "chapters": entries,
    }
    CHUNK_CACHE_PATH.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
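
# For reference, the cache file written above looks roughly like this
# (field values are illustrative):
#
#   {
#     "text_signature": {"size": 181234, "mtime": 1700000000.0},
#     "chapters": [
#       {"index": 1, "title": "...", "hash": "<sha1 of the body>",
#        "summary": "#### Panorama narrativo\n- ..."}
#     ]
#   }
#
# load_chunk_cache drops the whole file when the signature differs and
# skips entries whose SHA-1 no longer matches the current chapter body.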
def generate_summary() -> str:
    api_key = load_api_key()
    model = os.getenv("GEMINI_MODEL", DEFAULT_MODEL)
    client = genai.Client(api_key=api_key)
    text = TEXT_PATH.read_text(encoding="utf-8")
    chapters = split_into_chapters(text)
    if not chapters:
        raise GeminiError("No se pudieron detectar capítulos en el texto.")
    if len(chapters) != TARGET_PAGES:
        print(
            f"Advertencia: se detectaron {len(chapters)} capítulos, "
            f"pero se esperaban {TARGET_PAGES}."
        )

    signature = current_text_signature()
    cached = load_chunk_cache(signature, chapters)
    chapter_summaries: List[Optional[str]] = [None] * len(chapters)
    for pos, chapter in enumerate(chapters, start=1):
        cached_summary = cached.get(chapter.index)
        if cached_summary:
            chapter_summaries[pos - 1] = cached_summary

    total = len(chapters)
    print(f"Procesando {total} capítulos con el modelo {model}...")
    for idx, chapter in enumerate(chapters, start=1):
        if chapter_summaries[idx - 1]:
            print(f"- Capítulo {chapter.index}/{total} restaurado desde cache.")
            continue
        prompt = build_chunk_prompt(chapter, idx, total)
        summary = call_gemini(client, model, prompt, max_output_tokens=1536)
        chapter_summaries[idx - 1] = summary
        # Persist after every successful call so an interrupted run can
        # resume from the cache instead of re-querying the model.
        persist_chunk_cache(signature, chapters, chapter_summaries)
        print(
            f"- Capítulo {chapter.index}/{total} sintetizado "
            f"({len(chapter.body)} caracteres)."
        )
        time.sleep(PAUSE_SECONDS)

    completed_summaries = cast(
        List[str],
        [s for s in chapter_summaries if s is not None],
    )
    if len(completed_summaries) != total:
        raise GeminiError("No se completaron todos los fragmentos.")
    final_markdown = compose_markdown(chapters, completed_summaries)
    MD_OUTPUT.write_text(final_markdown.strip() + "\n", encoding="utf-8")
    return final_markdown


def main() -> None:
    generate_summary()
    print(f"\nResumen Markdown guardado en: {MD_OUTPUT}")
    PDF_OUTPUT.unlink(missing_ok=True)  # remove any previous PDF first
    command = (
        f'pandoc "{MD_OUTPUT}" --standalone --pdf-engine=xelatex '
        f'-o "{PDF_OUTPUT}"'
    )
    result = os.system(command)
    if result != 0:
        raise RuntimeError(
            "pandoc no pudo crear el PDF. Revisa que LaTeX esté instalado."
        )
    print(f"PDF generado en: {PDF_OUTPUT}")


if __name__ == "__main__":
    main()
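
# How to run (sketch; the script name below is hypothetical, use whatever
# file name this module is saved under):
#
#   export GEMINI_API_KEY="..."              # or put GEMINI_API_KEY=... in .env
#   export GEMINI_MODEL="gemini-2.0-flash"   # optional, this is the default
#   python3 resumir.py
#
# The PDF step shells out to pandoc with the xelatex engine, so both pandoc
# and a TeX distribution must be on PATH.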