#!/usr/bin/env python3
"""
Twitch Highlight Generator - UNIFIED VERSION

Combines the GPU-accelerated peak detector and the video generator in a
single file.

Usage:
    python3 highlight_generator.py --video stream.mp4 --chat chat.json --output highlights.mp4
"""

import argparse
import io
import json
import logging
import subprocess
import sys
from pathlib import Path

import torch
import torch.nn.functional as F

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)


def get_device():
    """Return the best available torch device (CUDA GPU if present, else CPU)."""
    if torch.cuda.is_available():
        device = torch.device("cuda")
        logger.info(f"GPU detectada: {torch.cuda.get_device_name(0)}")
        logger.info(
            f"Memoria GPU total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB"
        )
        return device
    return torch.device("cpu")


def load_audio_to_gpu(video_file, device="cuda", target_sr=16000):
    """Extract the audio track of *video_file* and load it as a torch tensor.

    Uses ffmpeg (mono, 16-bit PCM WAV on stdout) + soundfile + PyTorch.

    Args:
        video_file: path to the input video.
        device: target device ("cuda", "cpu" or a torch.device).
        target_sr: resample rate passed to ffmpeg (Hz).

    Returns:
        (waveform, sr): mono float32 tensor of shape (1, samples) on *device*,
        and the sample rate reported by the decoder.

    Raises:
        RuntimeError: if ffmpeg fails or produces no audio.
    """
    import time

    logger.info(f"Extrayendo audio de {video_file}...")
    t0 = time.time()
    cmd = [
        "ffmpeg",
        "-i",
        video_file,
        "-vn",
        "-acodec",
        "pcm_s16le",
        "-ar",
        str(target_sr),
        "-ac",
        "1",
        "-f",
        "wav",
        "pipe:1",
        "-y",
        "-threads",
        "4",
    ]
    result = subprocess.run(cmd, capture_output=True)
    # BUGFIX: the original ignored ffmpeg failures and later crashed with an
    # opaque soundfile error on an empty byte stream.
    if result.returncode != 0 or not result.stdout:
        raise RuntimeError(
            f"ffmpeg failed extracting audio from {video_file}: "
            f"{result.stderr.decode(errors='replace')[-500:]}"
        )
    logger.info(f"FFmpeg audio extraction: {time.time() - t0:.1f}s")

    import soundfile as sf

    waveform_np, sr = sf.read(io.BytesIO(result.stdout), dtype="float32")
    logger.info(f"Audio decode: {time.time() - t0:.1f}s")

    t1 = time.time()
    waveform = torch.from_numpy(waveform_np)
    # soundfile returns shape (frames,) for mono and (frames, channels)
    # for multi-channel audio.
    # BUGFIX: the original averaged multi-channel audio over dim=0 (the time
    # axis); average over the channel axis instead. ffmpeg already forces
    # mono with "-ac 1", so this branch is only a safety net.
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)
    else:
        waveform = waveform.mean(dim=1).unsqueeze(0)
    # BUGFIX: pin_memory() requires a CUDA build and is pointless for CPU
    # targets; only use the pinned fast path when actually going to a GPU.
    if torch.cuda.is_available() and torch.device(device).type == "cuda":
        waveform = waveform.pin_memory().to(device, non_blocking=True)
    else:
        waveform = waveform.to(device)
    logger.info(f"CPU->GPU transfer: {time.time() - t1:.2f}s")
    logger.info(f"Audio cargado: shape={waveform.shape}, SR={sr}")
    logger.info(f"Rango de audio: [{waveform.min():.4f}, {waveform.max():.4f}]")
    return waveform, sr


def detect_audio_peaks_gpu(
    video_file, threshold=1.5, window_seconds=5, device="cuda", skip_intro=600
):
    """Detect audio peaks using the GPU.

    Computes a sliding RMS energy (one frame per second, *window_seconds*
    long) in bounded-size chunks and flags frames whose z-score exceeds
    *threshold*.

    Returns:
        dict mapping frame index (= seconds relative to *skip_intro*,
        because the hop is exactly 1 s) -> z-score.
    """
    import time

    waveform, sr = load_audio_to_gpu(video_file, device=device)

    # Skip the intro portion of the stream.
    skip_samples = skip_intro * sr
    if waveform.shape[-1] > skip_samples:
        waveform = waveform[:, skip_samples:]
        logger.info(f"Audio: saltados primeros {skip_intro}s ({skip_samples} samples)")

    t0 = time.time()
    frame_length = sr * window_seconds
    hop_length = sr  # 1 s hop -> frame index == seconds since the skip point

    waveform = waveform.squeeze(0)
    # Keep the full signal on CPU and stream chunks to the device to bound
    # peak GPU memory usage.
    waveform_cpu = waveform.cpu()
    del waveform
    torch.cuda.empty_cache()

    total_samples = waveform_cpu.shape[-1]
    # BUGFIX: audio shorter than one analysis window produced a negative
    # frame count in the original; bail out with no peaks instead.
    if total_samples < frame_length:
        logger.info("Picos de audio detectados: 0")
        return {}
    num_frames = 1 + (total_samples - frame_length) // hop_length
    chunk_frames = 5000
    num_chunks = (num_frames + chunk_frames - 1) // chunk_frames
    logger.info(f"Processing {num_frames} frames in {num_chunks} chunks...")

    all_energies = []
    chunk_times = []
    for chunk_idx in range(num_chunks):
        chunk_start = chunk_idx * chunk_frames
        chunk_end = min((chunk_idx + 1) * chunk_frames, num_frames)
        actual_frames = chunk_end - chunk_start
        if actual_frames <= 0:
            break

        sample_start = chunk_start * hop_length
        sample_end = sample_start + frame_length + (actual_frames - 1) * hop_length
        if sample_end > total_samples:
            # Zero-pad the tail so the last chunk still yields full windows.
            padding_needed = sample_end - total_samples
            chunk_waveform_np = F.pad(waveform_cpu[sample_start:], (0, padding_needed))
        else:
            chunk_waveform_np = waveform_cpu[sample_start:sample_end]

        chunk_waveform = chunk_waveform_np.to(device)
        if chunk_waveform.shape[-1] < frame_length:
            del chunk_waveform
            continue

        windows = chunk_waveform.unfold(0, frame_length, hop_length)
        ct = time.time()
        # RMS energy per window is the only feature actually used downstream.
        # (The original also computed an rFFT magnitude mean and a smoothed
        # energy here, then immediately deleted both — dead work, removed.)
        energies = torch.sqrt(torch.mean(windows**2, dim=1))
        chunk_time = time.time() - ct
        chunk_times.append(chunk_time)

        all_energies.append(energies.cpu())
        del chunk_waveform, windows, energies
        torch.cuda.empty_cache()

        if chunk_idx < 3:
            logger.info(
                f"Chunk {chunk_idx + 1}/{num_chunks}: {actual_frames} frames, GPU time: {chunk_time:.2f}s"
            )

    # BUGFIX: guard the average against an empty chunk_times list (original
    # raised ZeroDivisionError if every chunk was skipped).
    logger.info(
        f"GPU Processing: {time.time() - t0:.2f}s total, avg chunk: {sum(chunk_times) / max(len(chunk_times), 1):.2f}s"
    )

    t1 = time.time()
    all_energies_tensor = torch.cat(all_energies).to(device)
    mean_e = torch.mean(all_energies_tensor)
    std_e = torch.std(all_energies_tensor)
    logger.info(f"Final stats (GPU): {time.time() - t1:.2f}s")
    logger.info(f"Audio stats: media={mean_e:.4f}, std={std_e:.4f}")

    t2 = time.time()
    z_scores = (all_energies_tensor - mean_e) / (std_e + 1e-8)
    peak_mask = z_scores > threshold
    logger.info(f"Peak detection (GPU): {time.time() - t2:.2f}s")

    # Single device->host sync instead of one .item() call per frame.
    peak_indices = torch.nonzero(peak_mask).flatten().tolist()
    peak_values = z_scores[peak_mask].tolist()
    audio_scores = dict(zip(peak_indices, peak_values))

    logger.info(f"Picos de audio detectados: {len(audio_scores)}")
    return audio_scores


def detect_chat_peaks_gpu(chat_data, threshold=1.5, device="cuda", skip_intro=600):
    """Analyse chat message rate and return activity bursts.

    Args:
        chat_data: parsed Twitch chat JSON with a "comments" list whose items
            carry "content_offset_seconds".
        threshold: z-score threshold for a second to count as a peak.
        device: torch device for the statistics.
        skip_intro: seconds of intro to ignore.

    Returns:
        (chat_scores, chat_times) where chat_scores maps seconds *relative to
        skip_intro* -> z-score, and chat_times maps absolute second ->
        message count.
    """
    chat_times = {}
    for comment in chat_data["comments"]:
        second = int(comment["content_offset_seconds"])
        if second >= skip_intro:
            chat_times[second] = chat_times.get(second, 0) + 1
    if not chat_times:
        return {}, {}

    chat_values = list(chat_times.values())
    chat_tensor = torch.tensor(chat_values, dtype=torch.float32, device=device)
    mean_c = torch.mean(chat_tensor)
    std_c = torch.std(chat_tensor)
    logger.info(f"Chat stats: media={mean_c:.1f}, std={std_c:.1f}")

    z_scores = (chat_tensor - mean_c) / (std_c + 1e-8)
    peak_mask = z_scores > threshold

    chat_scores = {}
    for i, second in enumerate(chat_times):
        if peak_mask[i].item():
            # BUGFIX: store the peak relative to skip_intro. The original
            # kept absolute seconds here while audio peaks were relative, so
            # combine_scores_gpu (which adds skip_intro back once) shifted
            # every chat-driven highlight forward by skip_intro twice.
            chat_scores[second - skip_intro] = z_scores[i].item()
    logger.info(f"Picos de chat: {len(chat_scores)}")
    return chat_scores, chat_times


def combine_scores_gpu(
    chat_scores,
    audio_scores,
    duration,
    min_duration,
    device="cuda",
    window=3,
    skip_intro=0,
):
    """Merge chat and audio peaks into highlight intervals.

    Both score dicts are keyed by seconds relative to *skip_intro*; the
    returned intervals are absolute (skip_intro is added back exactly once).

    The per-second scores are smoothed with a moving average of width
    2*window+1, max-normalized, and a second counts as highlight material if
    either signal exceeds 0.25 of its own maximum. Contiguous runs at least
    *min_duration* seconds long become intervals.

    Returns:
        list of (start, end) tuples in absolute seconds.
    """
    logger.info(f"Combinando scores (ventana={window}s, skip_intro={skip_intro}s)...")
    if duration <= 0:
        return []

    chat_tensor = torch.zeros(duration, device=device)
    for sec, score in chat_scores.items():
        # Reject negative keys too: they would silently index from the end.
        if 0 <= sec < duration:
            chat_tensor[sec] = score

    audio_tensor = torch.zeros(duration, device=device)
    for sec, score in audio_scores.items():
        if 0 <= sec < duration:
            audio_tensor[sec] = score

    kernel_size = window * 2 + 1
    kernel = torch.ones(1, 1, kernel_size, device=device) / kernel_size
    chat_reshaped = chat_tensor.unsqueeze(0).unsqueeze(0)
    audio_reshaped = audio_tensor.unsqueeze(0).unsqueeze(0)
    chat_smooth = F.conv1d(chat_reshaped, kernel, padding=window).squeeze()
    audio_smooth = F.conv1d(audio_reshaped, kernel, padding=window).squeeze()

    max_chat = chat_smooth.max()
    max_audio = audio_smooth.max()
    chat_normalized = chat_smooth / max_chat if max_chat > 0 else chat_smooth
    audio_normalized = audio_smooth / max_audio if max_audio > 0 else audio_smooth

    # Either signal above 25% of its own peak marks the second as interesting.
    points = (chat_normalized > 0.25).float() + (audio_normalized > 0.25).float()
    highlight_mask = points >= 1
    highlight_indices = torch.where(highlight_mask)[0]

    intervals = []
    if len(highlight_indices) > 0:
        start = highlight_indices[0].item()
        prev = highlight_indices[0].item()
        for idx in highlight_indices[1:]:
            second = idx.item()
            if second - prev > 1:
                # Gap: close the current run if it is long enough.
                if prev - start >= min_duration:
                    intervals.append((int(start + skip_intro), int(prev + skip_intro)))
                start = second
            prev = second
        if prev - start >= min_duration:
            intervals.append((int(start + skip_intro), int(prev + skip_intro)))
    return intervals


def create_summary_video(video_file, highlights, output_file, padding=3):
    """Render the highlight reel with ffmpeg's concat demuxer (stream copy).

    Args:
        video_file: source video.
        highlights: list of (start, end) absolute-second tuples.
        output_file: destination path.
        padding: seconds added before/after each clip (clamped to the video).
    """
    if not highlights:
        print("No hay highlights")
        return

    # Drop clips that would be shorter than 5 s even before padding.
    highlights = [(s, e) for s, e in highlights if e - s >= 5]
    print(f"Creando video con {len(highlights)} highlights...")

    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            video_file,
        ],
        capture_output=True,
        text=True,
    )
    # Fall back to one hour if ffprobe gave nothing usable.
    duration = float(result.stdout.strip()) if result.stdout.strip() else 3600

    clips = []
    for start, end in highlights:
        start_pad = max(0, start - padding)
        end_pad = min(duration, end + padding)
        clips.append((start_pad, end_pad))
        print(
            f" Clip: {start_pad:.1f}s - {end_pad:.1f}s (duración: {end_pad - start_pad:.1f}s)"
        )
    if not clips:
        print("No se pudo crear ningún clip")
        return

    # Concat-demuxer list: same input file repeated with in/out points.
    concat_file = "concat_list.txt"
    total_duration = 0
    with open(concat_file, "w") as f:
        for start, end in clips:
            clip_duration = end - start
            total_duration += clip_duration
            f.write(f"file '{video_file}'\n")
            f.write(f"inpoint {start}\n")
            f.write(f"outpoint {end}\n")

    print(f"Exportando video ({len(clips)} clips, {total_duration:.1f}s total)...")
    cmd = [
        "ffmpeg",
        "-f",
        "concat",
        "-safe",
        "0",
        "-i",
        concat_file,
        "-c",
        "copy",
        "-y",
        output_file,
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True)
    finally:
        # BUGFIX: remove the temp list via pathlib instead of spawning an
        # external "rm" process (portable, and runs even if ffmpeg raises).
        Path(concat_file).unlink(missing_ok=True)
    # BUGFIX: the original never checked the ffmpeg exit status and claimed
    # success unconditionally.
    if proc.returncode != 0:
        print(f"Error: ffmpeg fallo: {proc.stderr.decode(errors='replace')[-500:]}")
        return
    print(f"¡Listo! Video guardado en: {output_file}")


def main():
    """CLI entry point: detect highlights, dump JSON, optionally cut video."""
    parser = argparse.ArgumentParser(
        description="Twitch Highlight Generator - GPU Accelerated"
    )
    parser.add_argument("--video", required=True, help="Video file")
    parser.add_argument("--chat", required=True, help="Chat JSON file")
    parser.add_argument(
        "--output", default="highlights_final.mp4", help="Output video file"
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=1.0,
        help="Threshold for peaks (default: 1.0)",
    )
    parser.add_argument(
        "--min-duration",
        type=int,
        default=8,
        help="Min highlight duration (default: 8s)",
    )
    parser.add_argument(
        "--padding", type=int, default=3, help="Padding seconds (default: 3s)"
    )
    parser.add_argument(
        "--skip-intro",
        type=int,
        default=570,
        help="Skip intro seconds (default: 570s = 9.5min)",
    )
    parser.add_argument("--device", default="auto", help="Device: auto, cuda, cpu")
    parser.add_argument(
        "--json-only",
        action="store_true",
        help="Only generate JSON, skip video creation",
    )
    args = parser.parse_args()

    if args.device == "auto":
        device = get_device()
    else:
        device = torch.device(args.device)
    logger.info(f"Usando device: {device}")

    logger.info("=" * 60)
    logger.info("FASE 1: DETECTANDO HIGHLIGHTS")
    logger.info("=" * 60)

    logger.info("Cargando chat...")
    with open(args.chat, "r") as f:
        chat_data = json.load(f)

    logger.info(
        f"Saltando intro: primeros {args.skip_intro}s (~{args.skip_intro // 60}min)"
    )
    chat_scores, _ = detect_chat_peaks_gpu(
        chat_data, args.threshold, device=device, skip_intro=args.skip_intro
    )
    audio_scores = detect_audio_peaks_gpu(
        args.video, args.threshold, device=device, skip_intro=args.skip_intro
    )

    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            args.video,
        ],
        capture_output=True,
        text=True,
    )
    duration = int(float(result.stdout.strip())) if result.stdout.strip() else 3600

    intervals = combine_scores_gpu(
        chat_scores,
        audio_scores,
        duration,
        args.min_duration,
        device=device,
        skip_intro=args.skip_intro,
    )
    logger.info(f"Highlights encontrados: {len(intervals)}")

    # BUGFIX: str.replace(".mp4", ".json") replaced every occurrence and
    # silently did nothing for non-.mp4 outputs (the JSON would then
    # overwrite the video path). with_suffix handles both cases correctly.
    json_file = str(Path(args.output).with_suffix(".json"))
    with open(json_file, "w") as f:
        json.dump(intervals, f)
    logger.info(f"Timestamps guardados en: {json_file}")

    print(f"\n{'=' * 60}")
    print(f"HIGHLIGHTS DETECTADOS ({len(intervals)} total)")
    print(f"{'=' * 60}")
    for i, (s, e) in enumerate(intervals[:20]):
        mins = s // 60
        secs = s % 60
        # Renamed from "duration" to avoid shadowing the video duration above.
        clip_len = e - s
        print(f"{i + 1:2d}. {mins:02d}:{secs:02d} - {clip_len}s")
    print(f"{'=' * 60}")

    if args.json_only:
        logger.info("Modo JSON-only: saltando generación de video")
        return

    logger.info("=" * 60)
    logger.info("FASE 2: GENERANDO VIDEO")
    logger.info("=" * 60)
    create_summary_video(args.video, intervals, args.output, padding=args.padding)

    logger.info("=" * 60)
    logger.info("¡COMPLETADO!")
    logger.info(f"Video: {args.output}")
    logger.info(f"Timestamps: {json_file}")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()