- Implementación de detector híbrido (Whisper + Chat + Audio + VLM) - Sistema de detección de gameplay real vs hablando - Scene detection con FFmpeg - Soporte para RTX 3050 y RX 6800 XT - Guía completa en 6800xt.md para próxima IA - Scripts de filtrado visual y análisis de contexto - Pipeline automatizado de generación de videos
489 lines
14 KiB
Python
489 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Twitch Highlight Generator - UNIFIED VERSION
|
|
Combina detector GPU + video generator en un solo archivo.
|
|
|
|
Uso:
|
|
python3 highlight_generator.py --video stream.mp4 --chat chat.json --output highlights.mp4
|
|
"""
|
|
|
|
import argparse
|
|
import io
|
|
import json
|
|
import logging
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import torch
|
|
import torch.nn.functional as F
|
|
|
|
# Root logging: INFO and above, rendered as "LEVEL: message".
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.INFO,
)
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_device():
    """Choose the torch device for this run.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true (the
        name and total memory of device 0 are logged), otherwise ``cpu``.
    """
    if not torch.cuda.is_available():
        return torch.device("cpu")

    selected = torch.device("cuda")
    gpu_name = torch.cuda.get_device_name(0)
    logger.info(f"GPU detectada: {gpu_name}")
    # total_memory is reported in bytes; convert to GiB for the log line.
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    logger.info(f"Memoria GPU total: {total_gib:.1f} GB")
    return selected
|
|
|
|
|
|
def load_audio_to_gpu(video_file, device="cuda", target_sr=16000):
    """Decode a video's audio track with ffmpeg and load it as a torch tensor.

    ffmpeg is asked for mono 16-bit PCM WAV at ``target_sr`` Hz on stdout; the
    bytes are decoded with ``soundfile`` as float32 and moved to ``device``.

    Args:
        video_file: Path of the media file passed to ``ffmpeg -i``.
        device: Target device, as a string or ``torch.device``.
        target_sr: Sample rate requested from ffmpeg, in Hz.

    Returns:
        tuple: ``(waveform, sr)``. ``waveform`` is a float32 tensor of shape
        ``(1, num_samples)`` on ``device``; ``sr`` is the sample rate reported
        by soundfile.

    Raises:
        RuntimeError: If ffmpeg produced no audio data at all.
    """
    import time

    import soundfile as sf

    logger.info(f"Extrayendo audio de {video_file}...")
    t0 = time.time()

    # Every option is placed before the output it applies to. The original
    # command had "-y" and "-threads 4" after "pipe:1", where ffmpeg reports
    # them as trailing options and may ignore them.
    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        video_file,
        "-vn",
        "-acodec",
        "pcm_s16le",
        "-ar",
        str(target_sr),
        "-ac",
        "1",
        "-threads",
        "4",
        "-f",
        "wav",
        "pipe:1",
    ]

    result = subprocess.run(cmd, capture_output=True)
    logger.info(f"FFmpeg audio extraction: {time.time() - t0:.1f}s")

    if not result.stdout:
        # Fail here with ffmpeg's own message instead of letting the decoder
        # raise an opaque error on an empty buffer.
        detail = result.stderr.decode("utf-8", errors="replace").strip()[-500:]
        raise RuntimeError(
            f"ffmpeg produced no audio for {video_file} "
            f"(exit code {result.returncode}): {detail}"
        )
    if result.returncode != 0:
        # Partial output is still decoded (best effort), but not silently.
        logger.warning(
            f"ffmpeg exited with code {result.returncode}; using partial audio"
        )

    waveform_np, sr = sf.read(io.BytesIO(result.stdout), dtype="float32")
    logger.info(f"Audio decode: {time.time() - t0:.1f}s")

    t1 = time.time()
    target = torch.device(device)
    waveform = torch.from_numpy(waveform_np)
    if target.type == "cuda":
        # Page-locked memory only helps host->GPU copies, and pin_memory()
        # raises when no accelerator is present, so skip it on CPU.
        waveform = waveform.pin_memory()
    waveform = waveform.to(target, non_blocking=True)

    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)
    else:
        # soundfile returns multi-channel audio as (frames, channels), so the
        # channel axis to average over is dim 1.
        waveform = waveform.mean(dim=1).unsqueeze(0)

    logger.info(f"CPU->GPU transfer: {time.time() - t1:.2f}s")
    logger.info(f"Audio cargado: shape={waveform.shape}, SR={sr}")
    logger.info(f"Rango de audio: [{waveform.min():.4f}, {waveform.max():.4f}]")
    return waveform, sr
|
|
|
|
|
|
def detect_audio_peaks_gpu(
    video_file, threshold=1.5, window_seconds=5, device="cuda", skip_intro=600
):
    """Find loud moments in a video's audio track via windowed RMS energy.

    The audio is loaded with ``load_audio_to_gpu``, the first ``skip_intro``
    seconds are dropped (only when the audio is longer than that), and the RMS
    energy of ``window_seconds``-long windows is computed with a hop of one
    second. Windows whose z-score exceeds ``threshold`` are reported.

    Args:
        video_file: Media file to analyse.
        threshold: Minimum z-score for a window to count as a peak.
        window_seconds: Length of each energy window, in seconds.
        device: Torch device used for the per-chunk computation.
        skip_intro: Seconds of audio to discard from the start.

    Returns:
        dict: ``{window_index: z_score}`` for every peak window. Because the
        hop is one second, ``window_index`` is the window's start in seconds
        counted from the first retained sample (i.e. after the skipped intro
        when the intro was trimmed). Empty when fewer than two windows fit.
    """
    import time

    waveform, sr = load_audio_to_gpu(video_file, device=device)

    # Drop the intro
    skip_samples = skip_intro * sr
    if waveform.shape[-1] > skip_samples:
        waveform = waveform[:, skip_samples:]
        logger.info(f"Audio: saltados primeros {skip_intro}s ({skip_samples} samples)")

    t0 = time.time()
    frame_length = sr * window_seconds
    hop_length = sr

    # Keep the full signal on the host and stream it to the device in chunks.
    waveform_cpu = waveform.squeeze(0).cpu()
    del waveform
    torch.cuda.empty_cache()

    total_samples = waveform_cpu.shape[-1]
    num_frames = 1 + (total_samples - frame_length) // hop_length

    if num_frames < 2:
        # Zero windows would divide by zero / concatenate nothing below, and a
        # single window has no defined standard deviation: no peaks either way.
        logger.warning(
            f"Audio too short for peak detection ({total_samples} samples)"
        )
        return {}

    chunk_frames = 5000
    num_chunks = (num_frames + chunk_frames - 1) // chunk_frames

    logger.info(f"Processing {num_frames} frames in {num_chunks} chunks...")

    all_energies = []
    chunk_times = []

    for chunk_idx in range(num_chunks):
        chunk_start = chunk_idx * chunk_frames
        chunk_end = min(chunk_start + chunk_frames, num_frames)
        actual_frames = chunk_end - chunk_start

        # Exactly the samples needed for `actual_frames` windows. Since
        # chunk_end <= num_frames, sample_end never exceeds total_samples, so
        # no padding is required.
        sample_start = chunk_start * hop_length
        sample_end = sample_start + frame_length + (actual_frames - 1) * hop_length
        chunk_waveform = waveform_cpu[sample_start:sample_end].to(device)

        ct = time.time()
        windows = chunk_waveform.unfold(0, frame_length, hop_length)
        energies = torch.sqrt(torch.mean(windows**2, dim=1))
        all_energies.append(energies.cpu())
        chunk_time = time.time() - ct
        chunk_times.append(chunk_time)

        del chunk_waveform, windows, energies
        torch.cuda.empty_cache()

        if chunk_idx < 3:
            logger.info(
                f"Chunk {chunk_idx + 1}/{num_chunks}: {actual_frames} frames, GPU time: {chunk_time:.2f}s"
            )

    logger.info(
        f"GPU Processing: {time.time() - t0:.2f}s total, avg chunk: {sum(chunk_times) / len(chunk_times):.2f}s"
    )

    t1 = time.time()
    all_energies_tensor = torch.cat(all_energies).to(device)
    mean_e = torch.mean(all_energies_tensor)
    std_e = torch.std(all_energies_tensor)

    logger.info(f"Final stats (GPU): {time.time() - t1:.2f}s")
    logger.info(f"Audio stats: media={mean_e:.4f}, std={std_e:.4f}")

    t2 = time.time()
    z_scores = (all_energies_tensor - mean_e) / (std_e + 1e-8)
    peak_indices = torch.nonzero(z_scores > threshold).flatten().tolist()
    logger.info(f"Peak detection (GPU): {time.time() - t2:.2f}s")

    # One bulk transfer instead of one .item() round-trip per window.
    z_values = z_scores.tolist()
    audio_scores = {i: z_values[i] for i in peak_indices}

    logger.info(f"Picos de audio detectados: {len(audio_scores)}")
    return audio_scores
|
|
|
|
|
|
def detect_chat_peaks_gpu(chat_data, threshold=1.5, device="cuda", skip_intro=600):
    """Find seconds with unusually high chat activity.

    Messages are bucketed by the integer part of ``content_offset_seconds``;
    messages before ``skip_intro`` are ignored. The z-score of each bucket's
    message count is computed over the buckets that received at least one
    message, and buckets above ``threshold`` are reported.

    Args:
        chat_data: Mapping with a ``"comments"`` list; each comment must carry
            ``"content_offset_seconds"``.
        threshold: Minimum z-score for a second to count as a peak.
        device: Torch device used for the statistics.
        skip_intro: Messages earlier than this many seconds are discarded.

    Returns:
        tuple: ``(chat_scores, chat_times)``. ``chat_times`` maps each second
        (absolute offset into the stream) to its message count; ``chat_scores``
        maps the peak seconds to their z-score. Both are empty dicts when no
        message falls at or after ``skip_intro``.
    """
    chat_times = {}
    for comment in chat_data["comments"]:
        second = int(comment["content_offset_seconds"])
        if second >= skip_intro:
            chat_times[second] = chat_times.get(second, 0) + 1

    if not chat_times:
        return {}, {}

    chat_tensor = torch.tensor(
        list(chat_times.values()), dtype=torch.float32, device=device
    )

    mean_c = torch.mean(chat_tensor)
    if chat_tensor.numel() > 1:
        std_c = torch.std(chat_tensor)
    else:
        # The unbiased std of one sample is NaN (and warns); treat a lone
        # bucket as having no spread, which yields a z-score of 0.
        std_c = chat_tensor.new_zeros(())

    logger.info(f"Chat stats: media={mean_c:.1f}, std={std_c:.1f}")

    z_scores = (chat_tensor - mean_c) / (std_c + 1e-8)
    # Bulk-transfer scores and mask once instead of calling .item() per second.
    z_values = z_scores.tolist()
    is_peak = (z_scores > threshold).tolist()

    # Dict iteration order matches the order used to build chat_tensor.
    chat_scores = {
        second: z for second, z, peak in zip(chat_times, z_values, is_peak) if peak
    }

    logger.info(f"Picos de chat: {len(chat_scores)}")
    return chat_scores, chat_times
|
|
|
|
|
|
def _scores_to_timeline(scores, duration, device):
    """Scatter a ``{second: score}`` dict into a dense 1-D tensor of length ``duration``."""
    timeline = torch.zeros(duration)
    # Negative keys would otherwise wrap around and mark the END of the
    # timeline; keys >= duration would be out of bounds. Drop both.
    valid = [(sec, score) for sec, score in scores.items() if 0 <= sec < duration]
    if valid:
        seconds, values = zip(*valid)
        index = torch.tensor(seconds, dtype=torch.long)
        timeline[index] = torch.tensor(values, dtype=timeline.dtype)
    # Build on the host and transfer once, instead of one device write per key.
    return timeline.to(device)


def combine_scores_gpu(
    chat_scores,
    audio_scores,
    duration,
    min_duration,
    device="cuda",
    window=3,
    skip_intro=0,
):
    """Merge chat and audio peak scores into highlight intervals.

    Each score dict is laid out on a per-second timeline, smoothed with a
    moving average of ``2 * window + 1`` seconds and normalised by its own
    maximum. A second is a highlight when either smoothed signal exceeds 0.25.
    Runs of consecutive highlight seconds become intervals.

    Args:
        chat_scores: ``{second: score}`` for chat peaks.
        audio_scores: ``{second: score}`` for audio peaks.
        duration: Length of the timeline in seconds; keys outside
            ``[0, duration)`` are ignored.
        min_duration: A run is kept only when ``last - first >= min_duration``.
        device: Torch device used for the computation.
        window: Half-width of the smoothing kernel, in seconds.
        skip_intro: Offset added to both bounds of every returned interval, so
            the keys of both dicts are expected to be seconds counted from
            that offset.

    Returns:
        list: ``(start, end)`` integer tuples in ascending order, with
        ``skip_intro`` already added. Empty when ``duration <= 0``.
    """
    logger.info(f"Combinando scores (ventana={window}s, skip_intro={skip_intro}s)...")

    if duration <= 0:
        # Nothing to lay scores on; max() over an empty timeline would raise.
        return []

    chat_tensor = _scores_to_timeline(chat_scores, duration, device)
    audio_tensor = _scores_to_timeline(audio_scores, duration, device)

    kernel_size = window * 2 + 1
    kernel = torch.ones(1, 1, kernel_size, device=device) / kernel_size

    # reshape(-1) keeps a 1-D result even for a one-second timeline, where
    # squeeze() would collapse it to a 0-dim tensor.
    chat_smooth = F.conv1d(chat_tensor.view(1, 1, -1), kernel, padding=window).reshape(-1)
    audio_smooth = F.conv1d(audio_tensor.view(1, 1, -1), kernel, padding=window).reshape(-1)

    max_chat = chat_smooth.max()
    max_audio = audio_smooth.max()

    chat_normalized = chat_smooth / max_chat if max_chat > 0 else chat_smooth
    audio_normalized = audio_smooth / max_audio if max_audio > 0 else audio_smooth

    points = (chat_normalized > 0.25).float() + (audio_normalized > 0.25).float()

    # Single transfer to the host instead of one .item() sync per second.
    highlight_seconds = torch.nonzero(points >= 1).flatten().tolist()

    intervals = []
    if highlight_seconds:
        start = prev = highlight_seconds[0]

        for second in highlight_seconds[1:]:
            if second - prev > 1:
                # Gap found: close the current run if it is long enough.
                if prev - start >= min_duration:
                    intervals.append((int(start + skip_intro), int(prev + skip_intro)))
                start = second
            prev = second

        if prev - start >= min_duration:
            intervals.append((int(start + skip_intro), int(prev + skip_intro)))

    return intervals
|
|
|
|
|
|
def create_summary_video(
    video_file, highlights, output_file, padding=3, min_clip_duration=5
):
    """Cut the highlight intervals out of a video and join them with ffmpeg.

    Each highlight is widened by ``padding`` seconds on both sides (clamped to
    the video's duration as reported by ffprobe, or 3600 s when ffprobe gives
    no usable value). The clips are concatenated with ffmpeg's concat demuxer
    using stream copy.

    Args:
        video_file: Source video path.
        highlights: Iterable of ``(start, end)`` pairs in seconds.
        output_file: Path of the video to write (overwritten if present).
        padding: Seconds added before and after each highlight.
        min_clip_duration: Highlights with ``end - start`` below this many
            seconds are dropped.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    import os
    import tempfile

    if not highlights:
        print("No hay highlights")
        return

    highlights = [(s, e) for s, e in highlights if e - s >= min_clip_duration]

    print(f"Creando video con {len(highlights)} highlights...")

    probe = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            video_file,
        ],
        capture_output=True,
        text=True,
    )
    try:
        duration = float(probe.stdout.strip())
    except ValueError:
        # Empty or non-numeric ffprobe output (e.g. "N/A"): keep the fallback.
        duration = 3600

    clips = []
    for start, end in highlights:
        start_pad = max(0, start - padding)
        end_pad = min(duration, end + padding)
        if clips and clips[-1][0] <= start_pad <= clips[-1][1]:
            # Padding made this clip run into the previous one; extend the
            # previous clip so the overlap is not emitted twice.
            clips[-1] = (clips[-1][0], max(clips[-1][1], end_pad))
        else:
            clips.append((start_pad, end_pad))

    for start_pad, end_pad in clips:
        print(
            f" Clip: {start_pad:.1f}s - {end_pad:.1f}s (duración: {end_pad - start_pad:.1f}s)"
        )

    if not clips:
        print("No se pudo crear ningún clip")
        return

    # The concat demuxer resolves relative paths against the list file's
    # directory, so an existing local file is written as an absolute path.
    source = Path(video_file)
    source_path = str(source.resolve()) if source.exists() else str(video_file)
    # Inside a single-quoted concat entry a literal quote is written as '\''.
    quoted_source = source_path.replace("'", "'\\''")

    total_duration = sum(end - start for start, end in clips)

    # A unique temp file avoids clobbering a concurrent run's list file.
    fd, concat_file = tempfile.mkstemp(prefix="concat_list_", suffix=".txt")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            for start, end in clips:
                f.write(f"file '{quoted_source}'\n")
                f.write(f"inpoint {start}\n")
                f.write(f"outpoint {end}\n")

        print(f"Exportando video ({len(clips)} clips, {total_duration:.1f}s total)...")

        cmd = [
            "ffmpeg",
            "-f",
            "concat",
            "-safe",
            "0",
            "-i",
            concat_file,
            "-c",
            "copy",
            "-y",
            output_file,
        ]
        result = subprocess.run(cmd, capture_output=True)
    finally:
        # Portable cleanup that also runs when writing or ffmpeg raises.
        try:
            os.remove(concat_file)
        except OSError:
            pass

    if result.returncode != 0:
        detail = result.stderr.decode("utf-8", errors="replace").strip()[-500:]
        raise RuntimeError(
            f"ffmpeg failed to write {output_file} "
            f"(exit code {result.returncode}): {detail}"
        )

    print(f"¡Listo! Video guardado en: {output_file}")
|
|
|
|
|
|
def main():
    """Command-line entry point: detect highlights, save them, build the video.

    Phase 1 loads the chat JSON, detects chat and audio peaks, combines them
    into intervals and writes those as JSON next to the output video (same
    path with a ``.json`` suffix). Phase 2, skipped with ``--json-only``,
    renders the summary video with ``create_summary_video``.
    """
    parser = argparse.ArgumentParser(
        description="Twitch Highlight Generator - GPU Accelerated"
    )
    parser.add_argument("--video", required=True, help="Video file")
    parser.add_argument("--chat", required=True, help="Chat JSON file")
    parser.add_argument(
        "--output", default="highlights_final.mp4", help="Output video file"
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=1.0,
        help="Threshold for peaks (default: 1.0)",
    )
    parser.add_argument(
        "--min-duration",
        type=int,
        default=8,
        help="Min highlight duration (default: 8s)",
    )
    parser.add_argument(
        "--padding", type=int, default=3, help="Padding seconds (default: 3s)"
    )
    parser.add_argument(
        "--skip-intro",
        type=int,
        default=570,
        help="Skip intro seconds (default: 570s = 9.5min)",
    )
    parser.add_argument("--device", default="auto", help="Device: auto, cuda, cpu")
    parser.add_argument(
        "--json-only",
        action="store_true",
        help="Only generate JSON, skip video creation",
    )
    args = parser.parse_args()

    if args.device == "auto":
        device = get_device()
    else:
        device = torch.device(args.device)

    logger.info(f"Usando device: {device}")

    logger.info("=" * 60)
    logger.info("FASE 1: DETECTANDO HIGHLIGHTS")
    logger.info("=" * 60)

    logger.info("Cargando chat...")
    # Chat logs contain arbitrary Unicode; do not depend on the locale default.
    with open(args.chat, "r", encoding="utf-8") as f:
        chat_data = json.load(f)

    logger.info(
        f"Saltando intro: primeros {args.skip_intro}s (~{args.skip_intro // 60}min)"
    )

    chat_scores, _ = detect_chat_peaks_gpu(
        chat_data, args.threshold, device=device, skip_intro=args.skip_intro
    )
    # detect_chat_peaks_gpu keys its peaks by absolute stream second, while
    # detect_audio_peaks_gpu indexes windows from the end of the skipped intro
    # and combine_scores_gpu adds skip_intro back to every interval. Rebase the
    # chat keys so chat highlights are not shifted by skip_intro twice.
    chat_scores = {
        second - args.skip_intro: score for second, score in chat_scores.items()
    }

    audio_scores = detect_audio_peaks_gpu(
        args.video, args.threshold, device=device, skip_intro=args.skip_intro
    )

    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            args.video,
        ],
        capture_output=True,
        text=True,
    )
    try:
        duration = int(float(result.stdout.strip()))
    except ValueError:
        # Empty or non-numeric ffprobe output (e.g. "N/A"): keep the fallback.
        duration = 3600

    intervals = combine_scores_gpu(
        chat_scores,
        audio_scores,
        duration,
        args.min_duration,
        device=device,
        skip_intro=args.skip_intro,
    )

    logger.info(f"Highlights encontrados: {len(intervals)}")

    # with_suffix always yields a distinct .json path; str.replace returned the
    # output path itself for names without ".mp4", so the JSON and the video
    # were written to the same file.
    json_file = str(Path(args.output).with_suffix(".json"))
    with open(json_file, "w") as f:
        json.dump(intervals, f)
    logger.info(f"Timestamps guardados en: {json_file}")

    print(f"\n{'=' * 60}")
    print(f"HIGHLIGHTS DETECTADOS ({len(intervals)} total)")
    print(f"{'=' * 60}")
    for i, (s, e) in enumerate(intervals[:20], start=1):
        mins = s // 60
        secs = s % 60
        length = e - s  # named so it does not shadow the video duration above
        print(f"{i:2d}. {mins:02d}:{secs:02d} - {length}s")
    print(f"{'=' * 60}")

    if args.json_only:
        logger.info("Modo JSON-only: saltando generación de video")
        return

    logger.info("=" * 60)
    logger.info("FASE 2: GENERANDO VIDEO")
    logger.info("=" * 60)

    create_summary_video(args.video, intervals, args.output, padding=args.padding)

    logger.info("=" * 60)
    logger.info("¡COMPLETADO!")
    logger.info(f"Video: {args.output}")
    logger.info(f"Timestamps: {json_file}")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()
|