twitch-highlight-detector/context_detector.py
renato97 00180d0b1c Complete highlight-detection system with VLM and gameplay analysis
- Hybrid detector implementation (Whisper + Chat + Audio + VLM)
- Detection of actual gameplay vs. just-talking segments
- Scene detection with FFmpeg
- Support for RTX 3050 and RX 6800 XT
- Complete guide in 6800xt.md for the next AI
- Visual filtering and context-analysis scripts
- Automated video-generation pipeline
2026-02-19 17:38:14 +00:00


#!/usr/bin/env python3
"""
Twitch Highlight Generator - CONTEXT-AWARE VERSION
Captures the full context: setup + epic moment + reaction
"""
import argparse
import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path

import numpy as np
import torch
from openai import OpenAI
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
class ContextAwareDetector:
"""Detector que prioriza contexto y duración sobre cantidad."""
def __init__(self, device="cuda", api_key=None):
self.device = (
torch.device(device) if torch.cuda.is_available() else torch.device("cpu")
)
self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
self.client = None
if self.api_key:
base_url = os.environ.get("OPENAI_BASE_URL", "https://api.minimax.io/v1")
self.client = OpenAI(base_url=base_url, api_key=self.api_key)
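        # NOTE: the OpenAI-compatible client is configured here but never called
        # in this script; per the commit message, VLM analysis happens in
        # companion scripts.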
def transcribe_with_whisper(self, video_file, model_size="base"):
"""Transcribe el video usando Whisper."""
try:
import whisper
logger.info(f"Cargando Whisper ({model_size})...")
model = whisper.load_model(model_size, device=self.device)
logger.info(f"Transcribiendo {video_file}...")
result = model.transcribe(
video_file, language="es", word_timestamps=True, verbose=False
)
            logger.info(
                f"Transcription complete: {len(result['segments'])} segments"
            )
return result
except Exception as e:
logger.error(f"Error en Whisper: {e}")
return None
def detect_regions_of_interest(self, chat_data, transcription, skip_intro=455):
"""
Detecta REGIONES de interés (no solo picos) con contexto.
Ventanas de 30-45 segundos con alta actividad.
"""
logger.info("Detectando regiones de interés con contexto...")
        # 1. Build a per-second activity timeline from the chat
duration = (
max(int(c["content_offset_seconds"]) for c in chat_data["comments"]) + 1
)
activity = np.zeros(duration)
for comment in chat_data["comments"]:
sec = int(comment["content_offset_seconds"])
if sec >= skip_intro and sec < duration:
activity[sec] += 1
        # 2. Smooth to detect regions (10 s moving-average window)
from scipy.ndimage import uniform_filter1d
activity_smooth = uniform_filter1d(activity, size=10, mode="constant")
        # 3. Find regions above the 65th percentile of non-zero activity (fairly inclusive)
        threshold = np.percentile(activity_smooth[activity_smooth > 0], 65)
        logger.info(f"Activity threshold: {threshold:.1f} messages/s")
regions = []
in_region = False
region_start = 0
for i in range(skip_intro, len(activity_smooth)):
if activity_smooth[i] > threshold:
if not in_region:
region_start = i
in_region = True
else:
if in_region:
                    # Pad the region 10 s before and 15 s after for context
start = max(skip_intro, region_start - 10)
end = min(duration, i + 15)
                    if end - start >= 20:  # at least 20 seconds
regions.append((int(start), int(end)))
in_region = False
        # Capture the final region if the stream ends while inside one
if in_region:
start = max(skip_intro, region_start - 10)
end = min(duration, len(activity_smooth) + 15)
if end - start >= 20:
regions.append((int(start), int(end)))
logger.info(f"Regiones de interés detectadas: {len(regions)}")
return regions
def score_regions_with_transcription(self, regions, transcription):
"""
Puntúa cada región basándose en la transcripción.
Busca: insultos, rage, muertes, kills, risas.
"""
        if not transcription:
            # No transcription: give every region a neutral score, using the
            # same dict shape the rest of the pipeline (merge/extend) expects
            return [
                {
                    "start": s,
                    "end": e,
                    "duration": e - s,
                    "score": 5.0,
                    "raw_score": 0,
                    "reasons": [],
                }
                for s, e in regions
            ]
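        # Keyword lists are intentionally Spanish: they are matched against the
        # Spanish transcription (Whisper runs with language="es")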
keywords_scores = {
"rage": [
"puta",
"mierda",
"joder",
"hostia",
"retrasado",
"imbecil",
"tonto",
"idiota",
],
"death": [
"me mataron",
"me mori",
"muerto",
"mataron",
"kill",
"mate",
"murio",
],
"epic": [
"pentakill",
"baron",
"dragon",
"triple",
"quadra",
"ace",
"epico",
"god",
],
"laughter": ["jajaja", "jejeje", "jajajaja", "risa", "jaja"],
"frustration": [
"no puede ser",
"imposible",
"que dices",
"en serio",
"omg",
],
}
scored_regions = []
for start, end in regions:
score = 0
reasons = []
            # Scan the transcript segments that belong to this region
for seg in transcription.get("segments", []):
seg_start = seg["start"]
seg_end = seg["end"]
                # Only count segments fully contained in the region
if seg_start >= start and seg_end <= end:
text = seg["text"].lower()
for category, words in keywords_scores.items():
for word in words:
if word in text:
if category == "rage":
score += 3
if "rage" not in reasons:
reasons.append("rage")
elif category == "epic":
score += 3
if "epic" not in reasons:
reasons.append("epic")
elif category == "laughter":
score += 2
if "laughter" not in reasons:
reasons.append("laughter")
else:
score += 1
break
            # Normalize by duration (score per 30 s) so long regions don't dominate
duration = end - start
normalized_score = score / (duration / 30) if duration > 0 else 0
scored_regions.append(
{
"start": start,
"end": end,
"duration": duration,
"score": normalized_score,
"raw_score": score,
"reasons": reasons,
}
)
        # Sort by score, best first
scored_regions.sort(key=lambda x: -x["score"])
logger.info(f"Regiones puntuadas: {len(scored_regions)}")
for i, r in enumerate(scored_regions[:5]):
logger.info(
f" {i + 1}. {r['start'] // 60}m{r['start'] % 60}s - Score: {r['score']:.1f} "
f"({', '.join(r['reasons']) if r['reasons'] else 'activity'})"
)
return scored_regions
def merge_close_regions(self, regions, min_gap=30):
"""
Fusiona regiones que estén a menos de min_gap segundos.
Esto crea clips más largos con mejor flujo.
"""
        if not regions:
            return []
        # The merge assumes chronological order, but the caller passes regions
        # sorted by score, so re-sort by start time first
        regions = sorted(regions, key=lambda r: r["start"])
        merged = []
for region in regions:
if not merged:
merged.append(region)
else:
last = merged[-1]
                # Close enough: merge into the previous region
if region["start"] - last["end"] < min_gap:
last["end"] = max(last["end"], region["end"])
last["duration"] = last["end"] - last["start"]
                    # Duration-weighted average of the two scores
total_dur = last["duration"] + region["duration"]
last["score"] = (
last["score"] * last["duration"]
+ region["score"] * region["duration"]
) / total_dur
last["reasons"] = list(set(last["reasons"] + region["reasons"]))
else:
merged.append(region)
        logger.info(
            f"Merged regions: {len(merged)} (from {len(regions)} originally)"
        )
return merged
def extend_with_transcription_context(
self, regions, transcription, min_duration=30
):
"""
Extiende clips basándose en el contexto de la transcripción.
Si detecta que la conversación continúa interesante, extiende.
"""
extended = []
for region in regions:
start = region["start"]
end = region["end"]
if not transcription:
                # Without a transcription, extend symmetrically
start = max(0, start - 5)
end = end + 10
else:
                # Look up to 5 s before the clip for setup lines
for seg in transcription.get("segments", []):
if seg["end"] <= start and start - seg["end"] <= 5:
text = seg["text"].lower()
                        # If there is interesting setup, include it
if any(
word in text
for word in ["va", "vamos", "ahi", "cuidado", "ojo"]
):
start = int(seg["start"])
break
                # Look up to 10 s after the clip for a continuing reaction
for seg in transcription.get("segments", []):
if seg["start"] >= end and seg["start"] - end <= 10:
text = seg["text"].lower()
                        # If the reaction continues, extend the clip
if any(
word in text
for word in ["joder", "puta", "no", "madre", "dios"]
):
end = int(seg["end"])
break
            # Enforce the minimum duration
if end - start < min_duration:
end = start + min_duration
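                # (end may run past the video length here; create_video clamps
                # it against the ffprobe duration)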
extended.append(
{
"start": start,
"end": end,
"duration": end - start,
"score": region["score"],
"reasons": region["reasons"],
}
)
return extended
def detect(self, video_file, chat_file, skip_intro=455, use_whisper=True):
"""Pipeline completo de detección con contexto."""
logger.info("=" * 60)
logger.info("DETECTOR CON CONTEXTO - FASE 1: ANÁLISIS")
logger.info("=" * 60)
        # Load the chat dump
        with open(chat_file, "r", encoding="utf-8") as f:
chat_data = json.load(f)
        # Optional transcription
transcription = None
if use_whisper:
transcription = self.transcribe_with_whisper(video_file)
        # 1. Detect regions of interest
regions = self.detect_regions_of_interest(chat_data, transcription, skip_intro)
if not regions:
logger.warning("No se detectaron regiones")
return []
        # 2. Score against the transcription
        logger.info("\nPHASE 2: CONTEXT SCORING")
scored_regions = self.score_regions_with_transcription(regions, transcription)
        # 3. Keep the top 25 regions
top_regions = scored_regions[:25]
        # 4. Merge nearby regions (a larger min_gap merges more aggressively,
        # yielding fewer but longer clips)
        logger.info("\nPHASE 3: MERGING NEARBY REGIONS")
merged = self.merge_close_regions(top_regions, min_gap=45)
        # 5. Extend with context
        logger.info("\nPHASE 4: CONTEXT EXTENSION")
extended = self.extend_with_transcription_context(
merged, transcription, min_duration=18
)
        # 6. Keep the top 15 clips (more content), then restore chronological order
extended.sort(key=lambda x: -x["score"])
final_clips = extended[:15]
final_clips.sort(key=lambda x: x["start"])
logger.info("\n" + "=" * 60)
logger.info("CLIPS FINALES CON CONTEXTO")
logger.info("=" * 60)
total_dur = 0
for i, clip in enumerate(final_clips, 1):
mins = clip["start"] // 60
secs = clip["start"] % 60
total_dur += clip["duration"]
logger.info(
f"{i:2d}. {mins:02d}:{secs:02d} - {clip['duration']}s "
f"[Score: {clip['score']:.1f}] {', '.join(clip['reasons']) if clip['reasons'] else 'activity'}"
)
logger.info(
f"\nTotal: {len(final_clips)} clips, {total_dur}s ({total_dur // 60}m {total_dur % 60}s)"
)
return [(c["start"], c["end"]) for c in final_clips]
def create_video(video_file, highlights, output_file, padding=2):
"""Crea video final."""
if not highlights:
logger.error("No hay highlights")
return
result = subprocess.run(
[
"ffprobe",
"-v",
"error",
"-show_entries",
"format=duration",
"-of",
"default=noprint_wrappers=1:nokey=1",
video_file,
],
capture_output=True,
text=True,
)
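    # Fall back to a 1-hour duration if ffprobe returned nothing, so the
    # end-padding below still gets clamped to something sane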
duration = float(result.stdout.strip()) if result.stdout.strip() else 3600
    # tempfile.mktemp is deprecated and race-prone; NamedTemporaryFile is safer
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
        concat_file = f.name
for start, end in highlights:
start_pad = max(0, start - padding)
end_pad = min(duration, end + padding)
f.write(f"file '{video_file}'\n")
f.write(f"inpoint {start_pad}\n")
f.write(f"outpoint {end_pad}\n")
cmd = [
"ffmpeg",
"-f",
"concat",
"-safe",
"0",
"-i",
concat_file,
"-c",
"copy",
"-y",
output_file,
]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        logger.error(f"ffmpeg failed: {result.stderr[-500:]}")
    Path(concat_file).unlink()
    logger.info(f"Video written: {output_file}")
def main():
parser = argparse.ArgumentParser(description="Context-Aware Highlight Detector")
parser.add_argument("--video", required=True)
parser.add_argument("--chat", required=True)
parser.add_argument("--output", default="highlights_context.mp4")
parser.add_argument("--skip-intro", type=int, default=455)
parser.add_argument("--use-whisper", action="store_true")
args = parser.parse_args()
detector = ContextAwareDetector(
device="cuda" if torch.cuda.is_available() else "cpu"
)
highlights = detector.detect(
args.video, args.chat, skip_intro=args.skip_intro, use_whisper=args.use_whisper
)
    # Save the clip list as JSON
    json_file = str(Path(args.output).with_suffix(".json"))
with open(json_file, "w") as f:
json.dump(highlights, f)
    # Render the video
create_video(args.video, highlights, args.output)
print(f"\n{'=' * 60}")
print(f"COMPLETADO: {args.output}")
print(f"Clips: {len(highlights)}")
print(f"{'=' * 60}")
if __name__ == "__main__":
main()
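# Example invocation (hypothetical file names; the chat JSON is expected in
# TwitchDownloaderCLI format, with a content_offset_seconds per comment):
#   python context_detector.py --video vod.mp4 --chat chat.json \
#       --skip-intro 455 --use-whisper --output highlights_context.mp4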