#!/usr/bin/env python3 """ Twitch Highlight Generator - CONTEXT AWARE VERSION Captura contexto completo: setup + momento épico + reacción """ import argparse import io import json import logging import os import re import subprocess import sys import tempfile import time from pathlib import Path from typing import List, Tuple, Dict import cv2 import numpy as np import torch import torch.nn.functional as F from openai import OpenAI logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") logger = logging.getLogger(__name__) class ContextAwareDetector: """Detector que prioriza contexto y duración sobre cantidad.""" def __init__(self, device="cuda", api_key=None): self.device = ( torch.device(device) if torch.cuda.is_available() else torch.device("cpu") ) self.api_key = api_key or os.environ.get("OPENAI_API_KEY") self.client = None if self.api_key: base_url = os.environ.get("OPENAI_BASE_URL", "https://api.minimax.io/v1") self.client = OpenAI(base_url=base_url, api_key=self.api_key) def transcribe_with_whisper(self, video_file, model_size="base"): """Transcribe el video usando Whisper.""" try: import whisper logger.info(f"Cargando Whisper ({model_size})...") model = whisper.load_model(model_size, device=self.device) logger.info(f"Transcribiendo {video_file}...") result = model.transcribe( video_file, language="es", word_timestamps=True, verbose=False ) logger.info( f"Transcripción completada: {len(result['segments'])} segmentos" ) return result except Exception as e: logger.error(f"Error en Whisper: {e}") return None def detect_regions_of_interest(self, chat_data, transcription, skip_intro=455): """ Detecta REGIONES de interés (no solo picos) con contexto. Ventanas de 30-45 segundos con alta actividad. """ logger.info("Detectando regiones de interés con contexto...") # 1. Crear línea de tiempo de actividad duration = ( max(int(c["content_offset_seconds"]) for c in chat_data["comments"]) + 1 ) activity = np.zeros(duration) for comment in chat_data["comments"]: sec = int(comment["content_offset_seconds"]) if sec >= skip_intro and sec < duration: activity[sec] += 1 # 2. Suavizar para detectar regiones (ventana de 10s) from scipy.ndimage import uniform_filter1d activity_smooth = uniform_filter1d(activity, size=10, mode="constant") # 3. Encontrar regiones sobre el percentil 70 (más inclusivo) threshold = np.percentile(activity_smooth[activity_smooth > 0], 65) logger.info(f"Umbral de actividad: {threshold:.1f} mensajes/s") regions = [] in_region = False region_start = 0 for i in range(skip_intro, len(activity_smooth)): if activity_smooth[i] > threshold: if not in_region: region_start = i in_region = True else: if in_region: # Extender región 10s antes y 15s después para contexto start = max(skip_intro, region_start - 10) end = min(duration, i + 15) if end - start >= 20: # Mínimo 20 segundos regions.append((int(start), int(end))) in_region = False # Capturar última región if in_region: start = max(skip_intro, region_start - 10) end = min(duration, len(activity_smooth) + 15) if end - start >= 20: regions.append((int(start), int(end))) logger.info(f"Regiones de interés detectadas: {len(regions)}") return regions def score_regions_with_transcription(self, regions, transcription): """ Puntúa cada región basándose en la transcripción. Busca: insultos, rage, muertes, kills, risas. """ if not transcription: return [(s, e, 5.0) for s, e in regions] # Score neutral keywords_scores = { "rage": [ "puta", "mierda", "joder", "hostia", "retrasado", "imbecil", "tonto", "idiota", ], "death": [ "me mataron", "me mori", "muerto", "mataron", "kill", "mate", "murio", ], "epic": [ "pentakill", "baron", "dragon", "triple", "quadra", "ace", "epico", "god", ], "laughter": ["jajaja", "jejeje", "jajajaja", "risa", "jaja"], "frustration": [ "no puede ser", "imposible", "que dices", "en serio", "omg", ], } scored_regions = [] for start, end in regions: score = 0 reasons = [] # Analizar segmentos en esta región for seg in transcription.get("segments", []): seg_start = seg["start"] seg_end = seg["end"] # Si el segmento está en la región if seg_start >= start and seg_end <= end: text = seg["text"].lower() for category, words in keywords_scores.items(): for word in words: if word in text: if category == "rage": score += 3 if "rage" not in reasons: reasons.append("rage") elif category == "epic": score += 3 if "epic" not in reasons: reasons.append("epic") elif category == "laughter": score += 2 if "laughter" not in reasons: reasons.append("laughter") else: score += 1 break # Normalizar score por duración duration = end - start normalized_score = score / (duration / 30) if duration > 0 else 0 scored_regions.append( { "start": start, "end": end, "duration": duration, "score": normalized_score, "raw_score": score, "reasons": reasons, } ) # Ordenar por score scored_regions.sort(key=lambda x: -x["score"]) logger.info(f"Regiones puntuadas: {len(scored_regions)}") for i, r in enumerate(scored_regions[:5]): logger.info( f" {i + 1}. {r['start'] // 60}m{r['start'] % 60}s - Score: {r['score']:.1f} " f"({', '.join(r['reasons']) if r['reasons'] else 'activity'})" ) return scored_regions def merge_close_regions(self, regions, min_gap=30): """ Fusiona regiones que estén a menos de min_gap segundos. Esto crea clips más largos con mejor flujo. """ if not regions: return [] merged = [] for region in regions: if not merged: merged.append(region) else: last = merged[-1] # Si está cerca, fusionar if region["start"] - last["end"] < min_gap: last["end"] = max(last["end"], region["end"]) last["duration"] = last["end"] - last["start"] # Promedio de scores ponderado por duración total_dur = last["duration"] + region["duration"] last["score"] = ( last["score"] * last["duration"] + region["score"] * region["duration"] ) / total_dur last["reasons"] = list(set(last["reasons"] + region["reasons"])) else: merged.append(region) logger.info( f"Regiones fusionadas: {len(merged)} (de {len(regions)} originales)" ) return merged def extend_with_transcription_context( self, regions, transcription, min_duration=30 ): """ Extiende clips basándose en el contexto de la transcripción. Si detecta que la conversación continúa interesante, extiende. """ extended = [] for region in regions: start = region["start"] end = region["end"] if not transcription: # Sin transcripción, extender simétricamente start = max(0, start - 5) end = end + 10 else: # Buscar 5 segundos antes por setup for seg in transcription.get("segments", []): if seg["end"] <= start and start - seg["end"] <= 5: text = seg["text"].lower() # Si hay setup interesante, incluirlo if any( word in text for word in ["va", "vamos", "ahi", "cuidado", "ojo"] ): start = int(seg["start"]) break # Buscar 10 segundos después por reacción continua for seg in transcription.get("segments", []): if seg["start"] >= end and seg["start"] - end <= 10: text = seg["text"].lower() # Si hay reacción continua, extender if any( word in text for word in ["joder", "puta", "no", "madre", "dios"] ): end = int(seg["end"]) break # Asegurar duración mínima if end - start < min_duration: end = start + min_duration extended.append( { "start": start, "end": end, "duration": end - start, "score": region["score"], "reasons": region["reasons"], } ) return extended def detect(self, video_file, chat_file, skip_intro=455, use_whisper=True): """Pipeline completo de detección con contexto.""" logger.info("=" * 60) logger.info("DETECTOR CON CONTEXTO - FASE 1: ANÁLISIS") logger.info("=" * 60) # Cargar chat with open(chat_file, "r") as f: chat_data = json.load(f) # Transcripción transcription = None if use_whisper: transcription = self.transcribe_with_whisper(video_file) # 1. Detectar regiones de interés regions = self.detect_regions_of_interest(chat_data, transcription, skip_intro) if not regions: logger.warning("No se detectaron regiones") return [] # 2. Puntuar con transcripción logger.info("\nFASE 2: PUNTUACIÓN CON CONTEXTO") scored_regions = self.score_regions_with_transcription(regions, transcription) # 3. Tomar top 25 regiones top_regions = scored_regions[:25] # 4. Fusionar cercanas (gap mayor = menos fusiones, más clips) logger.info("\nFASE 3: FUSIÓN DE REGIONES CERCANAS") merged = self.merge_close_regions(top_regions, min_gap=45) # 5. Extender con contexto logger.info("\nFASE 4: EXTENSIÓN CON CONTEXTO") extended = self.extend_with_transcription_context( merged, transcription, min_duration=18 ) # 6. Tomar top 15 clips finales (más contenido) extended.sort(key=lambda x: -x["score"]) final_clips = extended[:15] final_clips.sort(key=lambda x: x["start"]) logger.info("\n" + "=" * 60) logger.info("CLIPS FINALES CON CONTEXTO") logger.info("=" * 60) total_dur = 0 for i, clip in enumerate(final_clips, 1): mins = clip["start"] // 60 secs = clip["start"] % 60 total_dur += clip["duration"] logger.info( f"{i:2d}. {mins:02d}:{secs:02d} - {clip['duration']}s " f"[Score: {clip['score']:.1f}] {', '.join(clip['reasons']) if clip['reasons'] else 'activity'}" ) logger.info( f"\nTotal: {len(final_clips)} clips, {total_dur}s ({total_dur // 60}m {total_dur % 60}s)" ) return [(c["start"], c["end"]) for c in final_clips] def create_video(video_file, highlights, output_file, padding=2): """Crea video final.""" if not highlights: logger.error("No hay highlights") return result = subprocess.run( [ "ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", video_file, ], capture_output=True, text=True, ) duration = float(result.stdout.strip()) if result.stdout.strip() else 3600 concat_file = tempfile.mktemp(suffix=".txt") with open(concat_file, "w") as f: for start, end in highlights: start_pad = max(0, start - padding) end_pad = min(duration, end + padding) f.write(f"file '{video_file}'\n") f.write(f"inpoint {start_pad}\n") f.write(f"outpoint {end_pad}\n") cmd = [ "ffmpeg", "-f", "concat", "-safe", "0", "-i", concat_file, "-c", "copy", "-y", output_file, ] subprocess.run(cmd, capture_output=True) Path(concat_file).unlink() logger.info(f"Video generado: {output_file}") def main(): import os parser = argparse.ArgumentParser(description="Context-Aware Highlight Detector") parser.add_argument("--video", required=True) parser.add_argument("--chat", required=True) parser.add_argument("--output", default="highlights_context.mp4") parser.add_argument("--skip-intro", type=int, default=455) parser.add_argument("--use-whisper", action="store_true") args = parser.parse_args() detector = ContextAwareDetector( device="cuda" if torch.cuda.is_available() else "cpu" ) highlights = detector.detect( args.video, args.chat, skip_intro=args.skip_intro, use_whisper=args.use_whisper ) # Guardar JSON json_file = args.output.replace(".mp4", ".json") with open(json_file, "w") as f: json.dump(highlights, f) # Generar video create_video(args.video, highlights, args.output) print(f"\n{'=' * 60}") print(f"COMPLETADO: {args.output}") print(f"Clips: {len(highlights)}") print(f"{'=' * 60}") if __name__ == "__main__": main()