Sistema completo de detección de highlights con VLM y análisis de gameplay

- Implementación de detector híbrido (Whisper + Chat + Audio + VLM)
- Sistema de detección de gameplay real vs hablando
- Scene detection con FFmpeg
- Soporte para RTX 3050 y RX 6800 XT
- Guía completa en 6800xt.md para próxima IA
- Scripts de filtrado visual y análisis de contexto
- Pipeline automatizado de generación de videos
2026-02-19 17:38:14 +00:00
parent c1c66a7d9a
commit 00180d0b1c
45 changed files with 10636 additions and 260 deletions
--- a/context_detector.py
+++ b/context_detector.py
@@ -0,0 +1,472 @@
+#!/usr/bin/env python3
+"""
+Twitch Highlight Generator - CONTEXT AWARE VERSION
+Captures the full context: setup + epic moment + reaction
+"""
+
+import argparse
+import io
+import json
+import logging
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+from typing import List, Tuple, Dict
+
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+from openai import OpenAI
+
+# Root logging emits "<LEVEL>: <message>" lines for INFO and above.
+logging.basicConfig(
+    format="%(levelname)s: %(message)s",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+class ContextAwareDetector:
+    """Detect highlight clips, favouring context and duration over clip count.
+
+    ``detect`` runs the whole pipeline: chat activity is turned into candidate
+    regions, the regions are scored against a Whisper transcription, nearby
+    regions are merged and every clip is extended so it keeps the setup and
+    the reaction around the moment itself.
+
+    Scored regions are dicts with the keys ``start``, ``end``, ``duration``,
+    ``score``, ``raw_score`` and ``reasons``.
+    """
+
+    # Region detection tuning (seconds, except the percentile).
+    SMOOTHING_WINDOW = 10
+    ACTIVITY_PERCENTILE = 65
+    CONTEXT_BEFORE = 10
+    CONTEXT_AFTER = 15
+    MIN_REGION_SECONDS = 20
+
+    # Score given to every region when there is no transcription to rank with.
+    NEUTRAL_SCORE = 5.0
+
+    # Keywords are written without accents; transcribed text is accent-folded
+    # before matching (see ``_fold``).
+    KEYWORDS = {
+        "rage": [
+            "puta",
+            "mierda",
+            "joder",
+            "hostia",
+            "retrasado",
+            "imbecil",
+            "tonto",
+            "idiota",
+        ],
+        "death": [
+            "me mataron",
+            "me mori",
+            "muerto",
+            "mataron",
+            "kill",
+            "mate",
+            "murio",
+        ],
+        "epic": [
+            "pentakill",
+            "baron",
+            "dragon",
+            "triple",
+            "quadra",
+            "ace",
+            "epico",
+            "god",
+        ],
+        "laughter": ["jajaja", "jejeje", "jajajaja", "risa", "jaja"],
+        "frustration": [
+            "no puede ser",
+            "imposible",
+            "que dices",
+            "en serio",
+            "omg",
+        ],
+    }
+
+    # Categories listed here are also reported in ``reasons``; every other
+    # category is worth one point and is not reported.
+    CATEGORY_POINTS = {"rage": 3, "epic": 3, "laughter": 2}
+
+    # Whole words that mark a setup just before / a reaction just after a clip.
+    SETUP_WORDS = frozenset({"va", "vamos", "ahi", "cuidado", "ojo"})
+    REACTION_WORDS = frozenset({"joder", "puta", "no", "madre", "dios"})
+
+    def __init__(self, device="cuda", api_key=None):
+        """Select the torch device and, when a key is available, the API client.
+
+        Args:
+            device: Torch device name used when CUDA is available; the CPU is
+                used otherwise.
+            api_key: Key for the OpenAI-compatible API. Falls back to the
+                ``OPENAI_API_KEY`` environment variable. Without a key,
+                ``self.client`` stays ``None``.
+        """
+        if torch.cuda.is_available():
+            self.device = torch.device(device)
+        else:
+            self.device = torch.device("cpu")
+
+        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
+        self.client = None
+        if self.api_key:
+            base_url = os.environ.get("OPENAI_BASE_URL", "https://api.minimax.io/v1")
+            self.client = OpenAI(base_url=base_url, api_key=self.api_key)
+
+    @staticmethod
+    def _fold(text):
+        """Return *text* lowercased with diacritics removed ("épico" -> "epico").
+
+        Note that this also maps "ñ" to "n".
+        """
+        import unicodedata
+
+        decomposed = unicodedata.normalize("NFD", text.lower())
+        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
+
+    @classmethod
+    def _mentions(cls, text, words):
+        """Return True when *text* contains any of *words* as a whole word."""
+        tokens = set(re.findall(r"\w+", cls._fold(text)))
+        return not words.isdisjoint(tokens)
+
+    def transcribe_with_whisper(self, video_file, model_size="base"):
+        """Transcribe the video with Whisper (Spanish, word timestamps).
+
+        Args:
+            video_file: Path of the media file to transcribe.
+            model_size: Whisper model name passed to ``whisper.load_model``.
+
+        Returns:
+            The Whisper result dict, or ``None`` when Whisper is missing or
+            the transcription fails.
+        """
+        # Best effort on purpose: the pipeline can run without a transcription,
+        # so any failure is logged and reported as ``None``.
+        try:
+            import whisper
+
+            logger.info(f"Cargando Whisper ({model_size})...")
+            model = whisper.load_model(model_size, device=self.device)
+
+            logger.info(f"Transcribiendo {video_file}...")
+            result = model.transcribe(
+                video_file, language="es", word_timestamps=True, verbose=False
+            )
+
+            logger.info(
+                f"Transcripción completada: {len(result['segments'])} segmentos"
+            )
+            return result
+
+        except Exception as e:
+            logger.error(f"Error en Whisper: {e}")
+            return None
+
+    def detect_regions_of_interest(self, chat_data, transcription, skip_intro=455):
+        """Find regions (not single peaks) of high chat activity, with context.
+
+        A per-second message count is smoothed over ``SMOOTHING_WINDOW``
+        seconds; runs above the ``ACTIVITY_PERCENTILE`` percentile of the
+        non-zero smoothed values become regions, padded by ``CONTEXT_BEFORE``
+        and ``CONTEXT_AFTER`` seconds.
+
+        Args:
+            chat_data: Dict with a ``comments`` list whose items carry
+                ``content_offset_seconds``.
+            transcription: Unused; kept for interface compatibility.
+            skip_intro: Seconds at the start of the stream to ignore.
+
+        Returns:
+            Chronological list of ``(start, end)`` integer tuples, each at
+            least ``MIN_REGION_SECONDS`` long. Empty when the chat has no
+            messages after the intro.
+        """
+        logger.info("Detectando regiones de interés con contexto...")
+
+        # A negative value would index the timeline from the end.
+        skip_intro = max(0, int(skip_intro))
+
+        offsets = [
+            int(comment["content_offset_seconds"])
+            for comment in chat_data.get("comments") or []
+        ]
+        if not offsets:
+            logger.warning("El chat no contiene mensajes")
+            return []
+
+        # 1. Per-second activity timeline.
+        duration = max(offsets) + 1
+        activity = np.zeros(duration)
+        for sec in offsets:
+            if sec >= skip_intro:
+                activity[sec] += 1
+
+        # 2. Smooth so that sustained activity stands out over single spikes.
+        from scipy.ndimage import uniform_filter1d
+
+        activity_smooth = uniform_filter1d(
+            activity, size=self.SMOOTHING_WINDOW, mode="constant"
+        )
+
+        # 3. Threshold on the percentile of the seconds that had any activity.
+        active = activity_smooth[activity_smooth > 0]
+        if active.size == 0:
+            logger.warning("No hay actividad de chat después de la intro")
+            return []
+        threshold = np.percentile(active, self.ACTIVITY_PERCENTILE)
+        logger.info(f"Umbral de actividad: {threshold:.1f} mensajes/s")
+
+        regions = []
+
+        def close_region(first_hot, first_cold):
+            # Pad the hot run with context and keep it only if long enough.
+            start = max(skip_intro, first_hot - self.CONTEXT_BEFORE)
+            end = min(duration, first_cold + self.CONTEXT_AFTER)
+            if end - start >= self.MIN_REGION_SECONDS:
+                regions.append((int(start), int(end)))
+
+        region_start = None
+        for i in range(skip_intro, duration):
+            if activity_smooth[i] > threshold:
+                if region_start is None:
+                    region_start = i
+            elif region_start is not None:
+                close_region(region_start, i)
+                region_start = None
+
+        # A run that is still open at the end of the timeline.
+        if region_start is not None:
+            close_region(region_start, duration)
+
+        logger.info(f"Regiones de interés detectadas: {len(regions)}")
+        return regions
+
+    def score_regions_with_transcription(self, regions, transcription):
+        """Score each region from the transcribed speech inside it.
+
+        Every segment fully contained in a region adds points once per keyword
+        category it mentions (see ``KEYWORDS`` and ``CATEGORY_POINTS``). The
+        raw score is normalised to points per 30 seconds. Keyword matching is
+        by substring on accent-folded text.
+
+        Args:
+            regions: Iterable of ``(start, end)`` tuples.
+            transcription: Whisper result dict, or ``None``.
+
+        Returns:
+            List of region dicts sorted by descending ``score``. Without a
+            transcription every region gets ``NEUTRAL_SCORE`` and the input
+            order is kept.
+        """
+        if not transcription:
+            # Same dict shape as the scored path, so later stages work
+            # without Whisper.
+            return [
+                {
+                    "start": start,
+                    "end": end,
+                    "duration": end - start,
+                    "score": self.NEUTRAL_SCORE,
+                    "raw_score": 0,
+                    "reasons": [],
+                }
+                for start, end in regions
+            ]
+
+        segments = [
+            (seg["start"], seg["end"], self._fold(seg["text"]))
+            for seg in transcription.get("segments", [])
+        ]
+
+        scored_regions = []
+
+        for start, end in regions:
+            score = 0
+            reasons = []
+
+            for seg_start, seg_end, text in segments:
+                # Only segments that lie completely inside the region count.
+                if seg_start < start or seg_end > end:
+                    continue
+
+                for category, words in self.KEYWORDS.items():
+                    if not any(word in text for word in words):
+                        continue
+                    score += self.CATEGORY_POINTS.get(category, 1)
+                    if category in self.CATEGORY_POINTS and category not in reasons:
+                        reasons.append(category)
+
+            # Normalise to points per 30 seconds.
+            duration = end - start
+            normalized_score = score / (duration / 30) if duration > 0 else 0
+
+            scored_regions.append(
+                {
+                    "start": start,
+                    "end": end,
+                    "duration": duration,
+                    "score": normalized_score,
+                    "raw_score": score,
+                    "reasons": reasons,
+                }
+            )
+
+        scored_regions.sort(key=lambda r: r["score"], reverse=True)
+
+        logger.info(f"Regiones puntuadas: {len(scored_regions)}")
+        for i, r in enumerate(scored_regions[:5]):
+            logger.info(
+                f"  {i + 1}. {r['start'] // 60}m{r['start'] % 60}s - Score: {r['score']:.1f} "
+                f"({', '.join(r['reasons']) if r['reasons'] else 'activity'})"
+            )
+
+        return scored_regions
+
+    def merge_close_regions(self, regions, min_gap=30):
+        """Merge regions that are less than *min_gap* seconds apart.
+
+        Regions may be given in any order; they are processed chronologically.
+        The merged score is the average of the two scores weighted by their
+        durations before the merge. Input dicts are not modified.
+
+        Args:
+            regions: Region dicts with ``start``, ``end``, ``duration``,
+                ``score`` and ``reasons``.
+            min_gap: Largest gap (exclusive) in seconds that still merges.
+
+        Returns:
+            New list of region dicts ordered by ``start``.
+        """
+        if not regions:
+            return []
+
+        merged = []
+
+        for region in sorted(regions, key=lambda r: r["start"]):
+            if not merged or region["start"] - merged[-1]["end"] >= min_gap:
+                # Copy so that merging never changes the caller's data.
+                merged.append({**region, "reasons": list(region["reasons"])})
+                continue
+
+            last = merged[-1]
+
+            # Weight with the durations as they were before this merge.
+            total_dur = last["duration"] + region["duration"]
+            if total_dur > 0:
+                last["score"] = (
+                    last["score"] * last["duration"]
+                    + region["score"] * region["duration"]
+                ) / total_dur
+
+            last["end"] = max(last["end"], region["end"])
+            last["duration"] = last["end"] - last["start"]
+
+            if "raw_score" in last or "raw_score" in region:
+                last["raw_score"] = last.get("raw_score", 0) + region.get(
+                    "raw_score", 0
+                )
+
+            # Order-preserving union keeps the output deterministic.
+            for reason in region["reasons"]:
+                if reason not in last["reasons"]:
+                    last["reasons"].append(reason)
+
+        logger.info(
+            f"Regiones fusionadas: {len(merged)} (de {len(regions)} originales)"
+        )
+        return merged
+
+    def extend_with_transcription_context(
+        self, regions, transcription, min_duration=30
+    ):
+        """Extend clips so they include the setup before and the reaction after.
+
+        With a transcription, the clip start moves back to the first segment
+        that ends within 5 s before the clip and contains a setup word, and
+        the clip end moves forward to the first segment that starts within
+        10 s after the clip and contains a reaction word. Words are matched as
+        whole words on accent-folded text. Without a transcription the clip is
+        padded by 5 s before and 10 s after.
+
+        Args:
+            regions: Region dicts with ``start``, ``end``, ``score`` and
+                ``reasons``.
+            transcription: Whisper result dict, or ``None``.
+            min_duration: Minimum clip length in seconds; shorter clips are
+                lengthened at the end.
+
+        Returns:
+            New list of dicts with ``start``, ``end``, ``duration``, ``score``
+            and ``reasons``, in the input order.
+        """
+        segments = transcription.get("segments", []) if transcription else []
+        extended = []
+
+        for region in regions:
+            start = region["start"]
+            end = region["end"]
+
+            if not transcription:
+                start = max(0, start - 5)
+                end = end + 10
+            else:
+                # Setup: a segment that finishes right before the clip.
+                for seg in segments:
+                    if seg["end"] <= start and start - seg["end"] <= 5:
+                        if self._mentions(seg["text"], self.SETUP_WORDS):
+                            start = int(seg["start"])
+                            break
+
+                # Reaction: a segment that begins right after the clip.
+                for seg in segments:
+                    if seg["start"] >= end and seg["start"] - end <= 10:
+                        if self._mentions(seg["text"], self.REACTION_WORDS):
+                            end = int(seg["end"])
+                            break
+
+            if end - start < min_duration:
+                end = start + min_duration
+
+            extended.append(
+                {
+                    "start": start,
+                    "end": end,
+                    "duration": end - start,
+                    "score": region["score"],
+                    "reasons": region["reasons"],
+                }
+            )
+
+        return extended
+
+    def detect(self, video_file, chat_file, skip_intro=455, use_whisper=True):
+        """Run the full context-aware detection pipeline.
+
+        Args:
+            video_file: Path of the video to analyse.
+            chat_file: Path of the chat JSON file (dict with ``comments``).
+            skip_intro: Seconds at the start of the stream to ignore.
+            use_whisper: Whether to transcribe the video for scoring.
+
+        Returns:
+            Chronological list of ``(start, end)`` tuples, at most 15 clips.
+            Empty when no region of interest is found.
+        """
+
+        logger.info("=" * 60)
+        logger.info("DETECTOR CON CONTEXTO - FASE 1: ANÁLISIS")
+        logger.info("=" * 60)
+
+        # Explicit encoding: the default depends on the platform locale.
+        with open(chat_file, "r", encoding="utf-8") as f:
+            chat_data = json.load(f)
+
+        transcription = None
+        if use_whisper:
+            transcription = self.transcribe_with_whisper(video_file)
+
+        # 1. Regions of interest from chat activity.
+        regions = self.detect_regions_of_interest(chat_data, transcription, skip_intro)
+
+        if not regions:
+            logger.warning("No se detectaron regiones")
+            return []
+
+        # 2. Score with the transcription.
+        logger.info("\nFASE 2: PUNTUACIÓN CON CONTEXTO")
+        scored_regions = self.score_regions_with_transcription(regions, transcription)
+
+        # 3. Keep the 25 best regions.
+        top_regions = scored_regions[:25]
+
+        # 4. Merge neighbours.
+        logger.info("\nFASE 3: FUSIÓN DE REGIONES CERCANAS")
+        merged = self.merge_close_regions(top_regions, min_gap=45)
+
+        # 5. Extend with context.
+        logger.info("\nFASE 4: EXTENSIÓN CON CONTEXTO")
+        extended = self.extend_with_transcription_context(
+            merged, transcription, min_duration=18
+        )
+
+        # 6. Keep the 15 best clips and present them chronologically.
+        extended.sort(key=lambda clip: clip["score"], reverse=True)
+        final_clips = extended[:15]
+        final_clips.sort(key=lambda clip: clip["start"])
+
+        logger.info("\n" + "=" * 60)
+        logger.info("CLIPS FINALES CON CONTEXTO")
+        logger.info("=" * 60)
+        total_dur = 0
+        for i, clip in enumerate(final_clips, 1):
+            mins = clip["start"] // 60
+            secs = clip["start"] % 60
+            total_dur += clip["duration"]
+            logger.info(
+                f"{i:2d}. {mins:02d}:{secs:02d} - {clip['duration']}s "
+                f"[Score: {clip['score']:.1f}] {', '.join(clip['reasons']) if clip['reasons'] else 'activity'}"
+            )
+
+        logger.info(
+            f"\nTotal: {len(final_clips)} clips, {total_dur}s ({total_dur // 60}m {total_dur % 60}s)"
+        )
+
+        return [(c["start"], c["end"]) for c in final_clips]
+
+
+def create_video(video_file, highlights, output_file, padding=2):
+    """Cut the highlights out of the video and join them with ffmpeg.
+
+    The clips are stream-copied (no re-encoding) through ffmpeg's concat
+    demuxer, using a temporary list file that is removed afterwards.
+
+    Args:
+        video_file: Path of the source video.
+        highlights: Iterable of ``(start, end)`` pairs in seconds.
+        output_file: Path of the video to write (overwritten if present).
+        padding: Seconds added before and after every clip, clamped to the
+            video bounds.
+
+    Returns:
+        None. Problems are reported through the module logger.
+    """
+    if not highlights:
+        logger.error("No hay highlights")
+        return
+
+    probe = subprocess.run(
+        [
+            "ffprobe",
+            "-v",
+            "error",
+            "-show_entries",
+            "format=duration",
+            "-of",
+            "default=noprint_wrappers=1:nokey=1",
+            video_file,
+        ],
+        capture_output=True,
+        text=True,
+    )
+    try:
+        duration = float(probe.stdout.strip())
+    except ValueError:
+        # ffprobe printed nothing or something non-numeric (e.g. "N/A");
+        # fall back to one hour.
+        duration = 3600
+
+    # Relative entries in a concat list are resolved against the directory of
+    # the list file (the temp dir), so a local path is written as absolute.
+    source = os.path.abspath(video_file) if os.path.exists(video_file) else video_file
+    # Inside single quotes, a literal quote is written as '\''.
+    source = str(source).replace("'", "'\\''")
+
+    # NamedTemporaryFile creates the file atomically, unlike tempfile.mktemp.
+    with tempfile.NamedTemporaryFile(
+        "w", suffix=".txt", delete=False, encoding="utf-8"
+    ) as f:
+        concat_file = f.name
+        for start, end in highlights:
+            start_pad = max(0, start - padding)
+            end_pad = min(duration, end + padding)
+            if end_pad <= start_pad:
+                # Clip lies beyond the end of the video.
+                continue
+            f.write(f"file '{source}'\n")
+            f.write(f"inpoint {start_pad}\n")
+            f.write(f"outpoint {end_pad}\n")
+
+    cmd = [
+        "ffmpeg",
+        "-f",
+        "concat",
+        "-safe",
+        "0",
+        "-i",
+        concat_file,
+        "-c",
+        "copy",
+        "-y",
+        output_file,
+    ]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True)
+    finally:
+        Path(concat_file).unlink()
+
+    if result.returncode != 0:
+        logger.error(f"ffmpeg falló ({result.returncode}): {result.stderr.strip()}")
+        return
+
+    logger.info(f"Video generado: {output_file}")
+
+
+def main():
+    """Command-line entry point: detect highlights, save them and build the video.
+
+    The detected ``(start, end)`` pairs are written as JSON next to the output
+    video (same name, ``.json`` suffix) before the video is generated.
+    """
+    parser = argparse.ArgumentParser(description="Context-Aware Highlight Detector")
+    parser.add_argument("--video", required=True)
+    parser.add_argument("--chat", required=True)
+    parser.add_argument("--output", default="highlights_context.mp4")
+    parser.add_argument("--skip-intro", type=int, default=455)
+    parser.add_argument("--use-whisper", action="store_true")
+
+    args = parser.parse_args()
+
+    detector = ContextAwareDetector(
+        device="cuda" if torch.cuda.is_available() else "cpu"
+    )
+
+    highlights = detector.detect(
+        args.video, args.chat, skip_intro=args.skip_intro, use_whisper=args.use_whisper
+    )
+
+    # Swap only the final suffix, so an output that does not end in ".mp4"
+    # still gets a JSON path of its own.
+    json_file = Path(args.output).with_suffix(".json")
+    with open(json_file, "w", encoding="utf-8") as f:
+        json.dump(highlights, f)
+
+    create_video(args.video, highlights, args.output)
+
+    print(f"\n{'=' * 60}")
+    print(f"COMPLETADO: {args.output}")
+    print(f"Clips: {len(highlights)}")
+    print(f"{'=' * 60}")
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()