- Implementación de detector híbrido (Whisper + Chat + Audio + VLM) - Sistema de detección de gameplay real vs hablando - Scene detection con FFmpeg - Soporte para RTX 3050 y RX 6800 XT - Guía completa en 6800xt.md para próxima IA - Scripts de filtrado visual y análisis de contexto - Pipeline automatizado de generación de videos
473 lines
15 KiB
Python
#!/usr/bin/env python3
"""
Twitch Highlight Generator - CONTEXT AWARE VERSION

Captures full context: setup + epic moment + reaction.

Pipeline: chat-activity region detection -> Whisper-based transcript
scoring -> merging/extension of regions -> FFmpeg concat of final clips.
"""

import argparse
import io
import json
import logging
import os
import re
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import List, Tuple, Dict

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from openai import OpenAI

# Module-wide logger; handlers are configured once at import time.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ContextAwareDetector:
    """Detector that favors context and clip duration over raw peak count.

    Pipeline: chat-activity regions -> transcript-based scoring ->
    merging of nearby regions -> context-aware extension.
    """

    def __init__(self, device="cuda", api_key=None):
        """
        Args:
            device: preferred torch device name; falls back to CPU when
                CUDA is unavailable.
            api_key: optional API key for an OpenAI-compatible endpoint;
                defaults to the OPENAI_API_KEY environment variable.
        """
        self.device = (
            torch.device(device) if torch.cuda.is_available() else torch.device("cpu")
        )
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self.client = None
        if self.api_key:
            # Allow pointing the client at any OpenAI-compatible endpoint.
            base_url = os.environ.get("OPENAI_BASE_URL", "https://api.minimax.io/v1")
            self.client = OpenAI(base_url=base_url, api_key=self.api_key)

    def transcribe_with_whisper(self, video_file, model_size="base"):
        """Transcribe the video with Whisper.

        Returns the Whisper result dict, or None on any failure —
        transcription is optional and the pipeline degrades gracefully.
        """
        try:
            import whisper

            logger.info(f"Cargando Whisper ({model_size})...")
            model = whisper.load_model(model_size, device=self.device)

            logger.info(f"Transcribiendo {video_file}...")
            result = model.transcribe(
                video_file, language="es", word_timestamps=True, verbose=False
            )

            logger.info(
                f"Transcripción completada: {len(result['segments'])} segmentos"
            )
            return result

        except Exception as e:
            # Best-effort: log and fall back to chat-only detection.
            logger.error(f"Error en Whisper: {e}")
            return None

    def detect_regions_of_interest(self, chat_data, transcription, skip_intro=455):
        """Detect REGIONS of interest (not single peaks) from chat activity.

        Windows of sustained activity are padded for context.

        Args:
            chat_data: parsed chat JSON with a "comments" list, each item
                carrying "content_offset_seconds".
            transcription: accepted for interface parity; not used here.
            skip_intro: seconds at the start of the VOD to ignore.

        Returns:
            List of (start, end) second tuples, chronological.
        """
        logger.info("Detectando regiones de interés con contexto...")

        # Guard: empty chat would crash max() below.
        if not chat_data.get("comments"):
            logger.warning("No se detectaron regiones")
            return []

        # 1. Build a per-second activity timeline.
        duration = (
            max(int(c["content_offset_seconds"]) for c in chat_data["comments"]) + 1
        )
        activity = np.zeros(duration)

        for comment in chat_data["comments"]:
            sec = int(comment["content_offset_seconds"])
            if skip_intro <= sec < duration:
                activity[sec] += 1

        # 2. Smooth with a 10 s window so regions, not spikes, stand out.
        from scipy.ndimage import uniform_filter1d

        activity_smooth = uniform_filter1d(activity, size=10, mode="constant")

        # 3. Threshold at the 65th percentile of the non-zero activity.
        #    Guard: all-zero activity (everything inside the skipped intro)
        #    would crash np.percentile on an empty array.
        positive = activity_smooth[activity_smooth > 0]
        if positive.size == 0:
            logger.warning("No se detectaron regiones")
            return []
        threshold = np.percentile(positive, 65)
        logger.info(f"Umbral de actividad: {threshold:.1f} mensajes/s")

        regions = []
        in_region = False
        region_start = 0

        for i in range(skip_intro, len(activity_smooth)):
            if activity_smooth[i] > threshold:
                if not in_region:
                    region_start = i
                    in_region = True
            elif in_region:
                # Pad 10 s before / 15 s after for context.
                start = max(skip_intro, region_start - 10)
                end = min(duration, i + 15)
                if end - start >= 20:  # keep only regions of at least 20 s
                    regions.append((int(start), int(end)))
                in_region = False

        # Close a region that runs to the end of the timeline.
        if in_region:
            start = max(skip_intro, region_start - 10)
            end = duration  # padding past the timeline is clamped anyway
            if end - start >= 20:
                regions.append((int(start), int(end)))

        logger.info(f"Regiones de interés detectadas: {len(regions)}")
        return regions

    def score_regions_with_transcription(self, regions, transcription):
        """Score each region from the transcript.

        Looks for: insults/rage, deaths/kills, epic plays, laughter,
        frustration. Scores are normalized to hits per 30 s so long
        regions are not unfairly favored.

        Fix: the no-transcription path now returns the same dict shape as
        the main path (it used to return bare tuples, which crashed
        merge_close_regions downstream whenever Whisper was disabled).
        """
        if not transcription:
            # Neutral score; dicts so the rest of the pipeline is uniform.
            return [
                {
                    "start": s,
                    "end": e,
                    "duration": e - s,
                    "score": 5.0,
                    "raw_score": 0,
                    "reasons": [],
                }
                for s, e in regions
            ]

        keywords_scores = {
            "rage": [
                "puta",
                "mierda",
                "joder",
                "hostia",
                "retrasado",
                "imbecil",
                "tonto",
                "idiota",
            ],
            "death": [
                "me mataron",
                "me mori",
                "muerto",
                "mataron",
                "kill",
                "mate",
                "murio",
            ],
            "epic": [
                "pentakill",
                "baron",
                "dragon",
                "triple",
                "quadra",
                "ace",
                "epico",
                "god",
            ],
            "laughter": ["jajaja", "jejeje", "jajajaja", "risa", "jaja"],
            "frustration": [
                "no puede ser",
                "imposible",
                "que dices",
                "en serio",
                "omg",
            ],
        }
        # Per-category weights; these three also become "reasons" labels.
        weights = {"rage": 3, "epic": 3, "laughter": 2}

        scored_regions = []

        for start, end in regions:
            score = 0
            reasons = []

            # At most one hit per category per segment (like the original
            # inner break): any() short-circuits on the first match.
            for seg in transcription.get("segments", []):
                # Only segments fully contained in the region count.
                if seg["start"] >= start and seg["end"] <= end:
                    text = seg["text"].lower()
                    for category, words in keywords_scores.items():
                        if any(word in text for word in words):
                            score += weights.get(category, 1)
                            if category in weights and category not in reasons:
                                reasons.append(category)

            # Normalize score to hits per 30 s of region duration.
            duration = end - start
            normalized_score = score / (duration / 30) if duration > 0 else 0

            scored_regions.append(
                {
                    "start": start,
                    "end": end,
                    "duration": duration,
                    "score": normalized_score,
                    "raw_score": score,
                    "reasons": reasons,
                }
            )

        # Best-scoring regions first.
        scored_regions.sort(key=lambda x: -x["score"])

        logger.info(f"Regiones puntuadas: {len(scored_regions)}")
        for i, r in enumerate(scored_regions[:5]):
            logger.info(
                f"  {i + 1}. {r['start'] // 60}m{r['start'] % 60}s - Score: {r['score']:.1f} "
                f"({', '.join(r['reasons']) if r['reasons'] else 'activity'})"
            )

        return scored_regions

    def merge_close_regions(self, regions, min_gap=30):
        """Merge regions separated by less than min_gap seconds.

        This yields longer clips with better flow.

        Fixes: operates on start-sorted copies, because the caller passes
        score-ordered regions (gap-based merging only makes sense
        chronologically) and the caller's dicts must stay untouched; the
        weighted score average now uses the pre-merge duration instead of
        the already-extended one (which double-counted the new span).
        """
        if not regions:
            return []

        # Shallow copies, chronological order.
        ordered = sorted((dict(r) for r in regions), key=lambda r: r["start"])

        merged = [ordered[0]]
        for region in ordered[1:]:
            last = merged[-1]
            if region["start"] - last["end"] < min_gap:
                # Fold into the previous region.
                prev_dur = last["duration"]
                last["end"] = max(last["end"], region["end"])
                last["duration"] = last["end"] - last["start"]
                # Duration-weighted score average over the two inputs.
                total_dur = prev_dur + region["duration"]
                last["score"] = (
                    last["score"] * prev_dur
                    + region["score"] * region["duration"]
                ) / total_dur
                last["reasons"] = list(set(last["reasons"] + region["reasons"]))
            else:
                merged.append(region)

        logger.info(
            f"Regiones fusionadas: {len(merged)} (de {len(regions)} originales)"
        )
        return merged

    def extend_with_transcription_context(
        self, regions, transcription, min_duration=30
    ):
        """Extend clips using transcript context.

        If the conversation right before/after a region looks relevant
        (setup calls or a continuing reaction), the clip grows to cover it.
        A minimum duration is enforced at the end.
        """
        extended = []

        for region in regions:
            start = region["start"]
            end = region["end"]

            if not transcription:
                # No transcript: pad symmetrically.
                start = max(0, start - 5)
                end = end + 10
            else:
                # Look up to 5 s back for a setup line.
                for seg in transcription.get("segments", []):
                    if seg["end"] <= start and start - seg["end"] <= 5:
                        text = seg["text"].lower()
                        if any(
                            word in text
                            for word in ["va", "vamos", "ahi", "cuidado", "ojo"]
                        ):
                            start = int(seg["start"])
                            break

                # Look up to 10 s ahead for a continuing reaction.
                for seg in transcription.get("segments", []):
                    if seg["start"] >= end and seg["start"] - end <= 10:
                        text = seg["text"].lower()
                        if any(
                            word in text
                            for word in ["joder", "puta", "no", "madre", "dios"]
                        ):
                            end = int(seg["end"])
                            break

            # Enforce the minimum clip length.
            if end - start < min_duration:
                end = start + min_duration

            extended.append(
                {
                    "start": start,
                    "end": end,
                    "duration": end - start,
                    "score": region["score"],
                    "reasons": region["reasons"],
                }
            )

        return extended

    def detect(self, video_file, chat_file, skip_intro=455, use_whisper=True):
        """Full context-aware detection pipeline.

        Returns:
            List of (start, end) second tuples, chronological.
        """
        logger.info("=" * 60)
        logger.info("DETECTOR CON CONTEXTO - FASE 1: ANÁLISIS")
        logger.info("=" * 60)

        # Load the chat dump (explicit encoding for portability).
        with open(chat_file, "r", encoding="utf-8") as f:
            chat_data = json.load(f)

        # Optional transcription.
        transcription = None
        if use_whisper:
            transcription = self.transcribe_with_whisper(video_file)

        # 1. Detect regions of interest.
        regions = self.detect_regions_of_interest(chat_data, transcription, skip_intro)

        if not regions:
            logger.warning("No se detectaron regiones")
            return []

        # 2. Score against the transcript.
        logger.info("\nFASE 2: PUNTUACIÓN CON CONTEXTO")
        scored_regions = self.score_regions_with_transcription(regions, transcription)

        # 3. Keep the 25 best-scoring regions.
        top_regions = scored_regions[:25]

        # 4. Merge nearby ones (larger gap = fewer merges, more clips).
        logger.info("\nFASE 3: FUSIÓN DE REGIONES CERCANAS")
        merged = self.merge_close_regions(top_regions, min_gap=45)

        # 5. Extend with transcript context.
        logger.info("\nFASE 4: EXTENSIÓN CON CONTEXTO")
        extended = self.extend_with_transcription_context(
            merged, transcription, min_duration=18
        )

        # 6. Final cut: best 15 clips, presented chronologically.
        extended.sort(key=lambda x: -x["score"])
        final_clips = extended[:15]
        final_clips.sort(key=lambda x: x["start"])

        logger.info("\n" + "=" * 60)
        logger.info("CLIPS FINALES CON CONTEXTO")
        logger.info("=" * 60)
        total_dur = 0
        for i, clip in enumerate(final_clips, 1):
            mins = clip["start"] // 60
            secs = clip["start"] % 60
            total_dur += clip["duration"]
            logger.info(
                f"{i:2d}. {mins:02d}:{secs:02d} - {clip['duration']}s "
                f"[Score: {clip['score']:.1f}] {', '.join(clip['reasons']) if clip['reasons'] else 'activity'}"
            )

        logger.info(
            f"\nTotal: {len(final_clips)} clips, {total_dur}s ({total_dur // 60}m {total_dur % 60}s)"
        )

        return [(c["start"], c["end"]) for c in final_clips]
|
|
|
|
|
|
def create_video(video_file, highlights, output_file, padding=2):
    """Cut the highlight clips and concatenate them into the final video.

    Args:
        video_file: path to the source video.
        highlights: list of (start, end) second tuples.
        output_file: destination path for the concatenated video.
        padding: extra seconds kept on each side of every clip.
    """
    if not highlights:
        logger.error("No hay highlights")
        return

    # Probe the source duration so padded clips never run past the end.
    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            video_file,
        ],
        capture_output=True,
        text=True,
    )
    # Fix: non-numeric/empty ffprobe output falls back instead of raising.
    try:
        duration = float(result.stdout.strip())
    except ValueError:
        duration = 3600.0

    # Fix: tempfile.mktemp() is race-prone and deprecated; create the
    # concat list atomically and keep it until ffmpeg has read it.
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
        concat_file = f.name
        for start, end in highlights:
            start_pad = max(0, start - padding)
            end_pad = min(duration, end + padding)
            f.write(f"file '{video_file}'\n")
            f.write(f"inpoint {start_pad}\n")
            f.write(f"outpoint {end_pad}\n")

    try:
        cmd = [
            "ffmpeg",
            "-f",
            "concat",
            "-safe",
            "0",
            "-i",
            concat_file,
            "-c",
            "copy",
            "-y",
            output_file,
        ]
        proc = subprocess.run(cmd, capture_output=True)
        # Fix: surface ffmpeg failures instead of logging success anyway.
        if proc.returncode != 0:
            logger.error(
                f"ffmpeg falló ({proc.returncode}): "
                f"{proc.stderr.decode(errors='replace')[-500:]}"
            )
            return
    finally:
        # Fix: the temp file is removed even if ffmpeg raises or fails.
        Path(concat_file).unlink(missing_ok=True)

    logger.info(f"Video generado: {output_file}")
|
|
|
|
|
|
def main():
    """CLI entry point: parse args, detect highlights, write JSON sidecar + video."""
    parser = argparse.ArgumentParser(description="Context-Aware Highlight Detector")
    parser.add_argument("--video", required=True)
    parser.add_argument("--chat", required=True)
    parser.add_argument("--output", default="highlights_context.mp4")
    parser.add_argument("--skip-intro", type=int, default=455)
    parser.add_argument("--use-whisper", action="store_true")

    args = parser.parse_args()

    detector = ContextAwareDetector(
        device="cuda" if torch.cuda.is_available() else "cpu"
    )

    highlights = detector.detect(
        args.video, args.chat, skip_intro=args.skip_intro, use_whisper=args.use_whisper
    )

    # Save the clip list as JSON.
    # Fix: str.replace(".mp4", ".json") is a no-op when the output has a
    # different suffix (the JSON would then be clobbered by the video);
    # Path.with_suffix always yields a distinct .json sidecar path.
    json_file = str(Path(args.output).with_suffix(".json"))
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(highlights, f)

    # Render the final video.
    create_video(args.video, highlights, args.output)

    print(f"\n{'=' * 60}")
    print(f"COMPLETADO: {args.output}")
    print(f"Clips: {len(highlights)}")
    print(f"{'=' * 60}")
|
|
|
|
|
|
# Run the CLI when executed as a script.
if __name__ == "__main__":
    main()
|