Files
twitch-highlight-detector/test_gpu.py
renato97 00180d0b1c Sistema completo de detección de highlights con VLM y análisis de gameplay
- Implementación de detector híbrido (Whisper + Chat + Audio + VLM)
- Sistema de detección de gameplay real vs hablando
- Scene detection con FFmpeg
- Soporte para RTX 3050 y RX 6800 XT
- Guía completa en 6800xt.md para próxima IA
- Scripts de filtrado visual y análisis de contexto
- Pipeline automatizado de generación de videos
2026-02-19 17:38:14 +00:00

466 lines
15 KiB
Python
Executable File

#!/usr/bin/env python3
"""
GPU Performance Profiler for Twitch Highlight Detector
Measures actual GPU kernel execution time vs wall clock time to detect CPU bottlenecks.
Uses torch.cuda.Event() for precise GPU timing.
GPU efficiency < 50% indicates CPU bottlenecks (implicit transfers, numpy usage, etc.)
"""
import argparse
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple

import numpy as np
import torch
import torch.nn.functional as F
class GPUProfiler:
    """Profiles GPU operations with CUDA events.

    For each named operation this records the GPU kernel time (measured with
    torch.cuda.Event), the wall-clock time, and their ratio ("efficiency" in
    percent). Efficiency well below 100% means the wall time was dominated by
    the CPU (implicit transfers, numpy usage, launch overhead) rather than
    actual GPU work.
    """

    def __init__(self, device: torch.device):
        self.device = device
        # Operation name -> {"gpu_ms", "wall_ms", "efficiency", "device"}
        self.results: Dict[str, Dict[str, float]] = {}

    def profile_operation(self, name: str, func, *args, **kwargs) -> Any:
        """
        Profile an operation and record GPU vs wall clock time.

        Args:
            name: Label under which the metrics are stored in self.results.
            func: Callable to execute; *args/**kwargs are forwarded to it.

        Returns:
            Result of the function call
        """
        if self.device.type == "cpu":
            # CPU fallback - just measure wall time. GPU time is reported
            # equal to wall time so efficiency is trivially 100%.
            start = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed = time.perf_counter() - start
            self.results[name] = {
                "gpu_ms": elapsed * 1000,
                "wall_ms": elapsed * 1000,
                "efficiency": 100.0,
                "device": "CPU"
            }
            return result
        # Create CUDA events for precise timing
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        # Synchronize before starting so previously queued work is excluded
        torch.cuda.synchronize()
        # Start wall clock timer
        wall_start = time.perf_counter()
        # Record GPU start
        start_event.record()
        # Execute function
        result = func(*args, **kwargs)
        # Record GPU end
        end_event.record()
        # Synchronize to ensure GPU finished
        torch.cuda.synchronize()
        # End wall clock timer
        wall_elapsed = (time.perf_counter() - wall_start) * 1000  # ms
        # Get GPU elapsed time
        gpu_elapsed = start_event.elapsed_time(end_event)  # ms
        # Calculate efficiency (% of wall time spent inside GPU kernels)
        efficiency = (gpu_elapsed / wall_elapsed * 100) if wall_elapsed > 0 else 0
        self.results[name] = {
            "gpu_ms": gpu_elapsed,
            "wall_ms": wall_elapsed,
            "efficiency": efficiency,
            "device": "CUDA"
        }
        return result

    def print_results(self):
        """Print profiling results in a formatted table."""
        print("\n" + "=" * 100)
        print("GPU PROFILING RESULTS".center(100))
        print("=" * 100)
        print(f"{'Operation':<30} {'GPU Time':<15} {'Wall Time':<15} {'Efficiency':<15} {'Status':<15}")
        print("-" * 100)
        for name, metrics in self.results.items():
            gpu_time = metrics["gpu_ms"]
            wall_time = metrics["wall_ms"]
            efficiency = metrics["efficiency"]
            device = metrics["device"]
            # Determine status
            if device == "CPU":
                status = "CPU ONLY"
            elif efficiency < 50:
                status = "CPU BOTTLENECK"
            elif efficiency < 80:
                status = "MIXED"
            else:
                status = "GPU OPTIMIZED"
            print(
                f"{name:<30} "
                f"{gpu_time:>10.2f} ms "
                f"{wall_time:>10.2f} ms "
                f"{efficiency:>10.1f}% "
                f"{status:<15}"
            )
        print("=" * 100)
        # Print warnings for bottlenecks
        for name, metrics in self.results.items():
            if metrics["efficiency"] < 50 and metrics["device"] == "CUDA":
                print(f"\nWARNING: '{name}' has GPU efficiency < 50% - likely CPU bottleneck!")
                # BUG FIX: previously read metrics['wall_time'], a key that is
                # never stored (the dict uses 'wall_ms') -> KeyError whenever
                # a bottleneck warning actually fired.
                print(f" GPU time: {metrics['gpu_ms']:.2f} ms, Wall time: {metrics['wall_ms']:.2f} ms")
                print(f" Missing GPU time: {metrics['wall_ms'] - metrics['gpu_ms']:.2f} ms")
def test_tensor_operations(device: torch.device, profiler: GPUProfiler):
    """Test basic tensor operations on GPU.

    Profiles sqrt/mean/std over a 1M-element random tensor and records the
    timings in the given profiler.
    """
    print("\n[1] Testing Basic Tensor Operations")
    # Create test data (synchronously). FIX: the original used a conditional
    # expression as a bare statement (`sync() if cuda else None`); a plain
    # if-statement is the idiomatic form and makes the intent explicit.
    if device.type == "cuda":
        torch.cuda.synchronize()
    data = torch.randn(1000000, device=device)

    def test_sqrt():
        return torch.sqrt(data ** 2)

    def test_mean():
        return torch.mean(data)

    def test_std():
        return torch.std(data)

    profiler.profile_operation("sqrt(square)", test_sqrt)
    profiler.profile_operation("mean", test_mean)
    profiler.profile_operation("std", test_std)
def test_unfold_operation(device: torch.device, profiler: GPUProfiler):
    """Test unfold (sliding window) operation - key for audio processing."""
    print("\n[2] Testing Unfold Operation (Sliding Windows)")
    # Simulate audio waveform (1 hour at 16kHz = 57.6M samples)
    # Use smaller size for testing
    sample_rate = 16000
    signal = torch.randn(sample_rate * 60, device=device)  # 1 minute
    win_len = sample_rate * 5  # 5-second frames
    step = sample_rate         # 1-second hop

    def run_unfold():
        # unfold produces the overlapping windows as a strided view
        return signal.unfold(0, win_len, step)

    profiler.profile_operation("unfold (sliding windows)", run_unfold)
def test_window_statistics(device: torch.device, profiler: GPUProfiler):
    """Test windowed RMS calculation (main audio processing operation)."""
    print("\n[3] Testing Windowed RMS Calculation")
    # One hour of 1-second hops; each window is 5 s at 16 kHz (as in detector)
    frame_count, frame_size = 3600, 80000
    frames = torch.randn(frame_count, frame_size, device=device)

    def compute_rms():
        # Root-mean-square energy per window
        return (frames ** 2).mean(dim=1).sqrt()

    profiler.profile_operation("RMS (windowed)", compute_rms)
def test_zscore_detection(device: torch.device, profiler: GPUProfiler):
    """Test z-score peak detection."""
    print("\n[4] Testing Z-Score Peak Detection")
    frame_energies = torch.randn(3600, device=device)
    peak_threshold = 1.5

    def detect_peaks():
        mu = frame_energies.mean()
        sigma = frame_energies.std()
        # Epsilon keeps the division safe for near-constant signals
        scores = (frame_energies - mu) / (sigma + 1e-8)
        mask = scores > peak_threshold
        return scores, mask

    profiler.profile_operation("z-score + peak detection", detect_peaks)
def test_conv1d_smoothing(device: torch.device, profiler: GPUProfiler):
    """Test convolution for smoothing (used in score combination)."""
    print("\n[5] Testing Conv1D Smoothing")
    total_seconds = 3600  # 1 hour
    half_width = 3
    k = half_width * 2 + 1
    # Sparse score track: ~100 random non-zero entries, like real highlights
    scores = torch.zeros(total_seconds, device=device)
    hits = torch.randint(0, total_seconds, (100,), device=device)
    scores[hits] = torch.randn(100, device=device)

    def smooth():
        # Uniform (box) kernel; padding keeps the output length unchanged
        box = torch.ones(1, 1, k, device=device) / k
        shaped = scores.unsqueeze(0).unsqueeze(0)
        return F.conv1d(shaped, box, padding=half_width).squeeze()

    profiler.profile_operation("conv1d smoothing", smooth)
def test_cpu_transfer_overhead(device: torch.device, profiler: GPUProfiler):
    """Test CPU-GPU transfer overhead."""
    print("\n[6] Testing CPU-GPU Transfer Overhead")
    if device.type == "cpu":
        print(" Skipping (CPU device)")
        return
    # Host-side float32 buffer, like soundfile output
    host_audio = np.random.randn(16000 * 60).astype(np.float32)  # 1 minute audio

    def host_to_device():
        # This mimics load_audio_to_gpu: pinned staging buffer + async copy
        return torch.from_numpy(host_audio).pin_memory().to(device, non_blocking=True)

    profiler.profile_operation("numpy -> GPU transfer", host_to_device)

    # Reverse direction: many tiny device->host reads
    device_vec = torch.randn(1000, device=device)

    def many_item_reads():
        # Mimics .item() calls in detector
        return [device_vec[j].item() for j in range(100)]

    profiler.profile_operation("GPU -> CPU item() x100", many_item_reads)
def test_numpy_fallback_detection(device: torch.device, profiler: GPUProfiler):
    """Detect if operations are falling back to CPU."""
    print("\n[7] Testing for Implicit CPU Transfers")
    if device.type == "cpu":
        print(" Skipping (CPU device)")
        return
    # Test: operations that might implicitly transfer to CPU
    on_device = torch.randn(10000, device=device)

    def to_numpy_roundtrip():
        # Forces a device->host copy before the numpy view can be taken
        return on_device.cpu().numpy()

    profiler.profile_operation("GPU tensor -> numpy (BAD)", to_numpy_roundtrip)

    def stays_on_device():
        # Equivalent math that never leaves the GPU
        return (on_device ** 2).sqrt()

    profiler.profile_operation("GPU tensor operations (GOOD)", stays_on_device)
def simulate_full_pipeline(device: torch.device, profiler: GPUProfiler):
    """Simulate the full audio detection pipeline."""
    print("\n[8] Simulating Full Audio Pipeline")
    # Simulate 1 hour of audio at 16 kHz
    sr = 16000
    audio = torch.randn(sr * 3600, device=device)
    frame_len = sr * 5  # 5-second analysis window
    hop = sr            # 1-second hop

    def run_pipeline():
        # Step 1: pad so the final hop still yields a complete frame
        n_frames = 1 + (audio.shape[-1] - frame_len) // hop
        pad = n_frames * hop + frame_len - audio.shape[-1]
        padded = F.pad(audio, (0, pad)) if pad > 0 else audio
        # Step 2: overlapping windows (strided view, no copy)
        frames = padded.unfold(0, frame_len, hop)
        # Step 3: per-frame RMS energy
        rms = torch.sqrt(torch.mean(frames ** 2, dim=1))
        # Step 4: global statistics
        mu = torch.mean(rms)
        sigma = torch.std(rms)
        # Step 5: z-score normalization and peak thresholding
        z = (rms - mu) / (sigma + 1e-8)
        return z, z > 1.5

    profiler.profile_operation("FULL AUDIO PIPELINE", run_pipeline)
def check_tensor_device_location(tensor: torch.Tensor, name: str):
    """Verify where a tensor is actually stored."""
    pinned = tensor.is_pinned()
    where = str(tensor.device)
    print(f" {name}:")
    print(f" Device: {where}")
    print(f" Pin memory: {pinned}")
    print(f" Shape: {tensor.shape}")
    print(f" Dtype: {tensor.dtype}")
def verify_gpu_allocation():
    """Check GPU memory allocation.

    Returns True when CUDA is usable (after printing device info),
    False otherwise.
    """
    if not torch.cuda.is_available():
        print("\nCUDA is not available. Running CPU-only tests.")
        return False
    props = torch.cuda.get_device_properties(0)
    print(f"\nGPU Information:")
    print(f" Device: {torch.cuda.get_device_name(0)}")
    print(f" Compute Capability: {torch.cuda.get_device_capability(0)}")
    print(f" Total Memory: {props.total_memory / 1024**3:.2f} GB")
    print(f" Current Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    print(f" Current Memory Cached: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")
    return True
def main():
    """CLI entry point: run every profiling test and print an analysis report.

    Flags:
        --device {auto,cuda,cpu}: device selection (auto prefers CUDA).
        --comprehensive: additionally measure CPU<->GPU transfer overhead.
    """
    parser = argparse.ArgumentParser(
        description="Profile GPU usage in Twitch Highlight Detector"
    )
    parser.add_argument(
        "--device",
        choices=["auto", "cuda", "cpu"],
        default="auto",
        help="Device to use (default: auto)"
    )
    parser.add_argument(
        "--comprehensive",
        action="store_true",
        help="Run comprehensive tests including CPU transfer overhead"
    )
    args = parser.parse_args()
    # Setup device
    if args.device == "auto":
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(args.device)
    print("=" * 100)
    print("PYTORCH CUDA GPU PROFILER".center(100))
    print("=" * 100)
    print(f"\nTarget Device: {device}")
    has_gpu = verify_gpu_allocation()
    print(f"\nGPU Available: {has_gpu}")
    # Create profiler
    profiler = GPUProfiler(device)
    # Run tests
    print("\n" + "=" * 100)
    print("RUNNING GPU UTILIZATION TESTS".center(100))
    print("=" * 100)
    test_tensor_operations(device, profiler)
    test_unfold_operation(device, profiler)
    test_window_statistics(device, profiler)
    test_zscore_detection(device, profiler)
    test_conv1d_smoothing(device, profiler)
    simulate_full_pipeline(device, profiler)
    # Transfer-overhead tests only run when explicitly requested AND a GPU exists
    if args.comprehensive and has_gpu:
        test_cpu_transfer_overhead(device, profiler)
        test_numpy_fallback_detection(device, profiler)
    # Print results
    profiler.print_results()
    # Analysis and recommendations
    print("\n" + "=" * 100)
    print("ANALYSIS AND RECOMMENDATIONS".center(100))
    print("=" * 100)
    # Average efficiency over compute ops only: the explicit transfer
    # benchmarks are excluded because they measure copies, not kernels.
    compute_ops = [name for name in profiler.results
                   if name not in ["numpy -> GPU transfer", "GPU -> CPU item() x100",
                                   "GPU tensor -> numpy (BAD)"]]
    if compute_ops:
        avg_efficiency = np.mean([profiler.results[op]["efficiency"] for op in compute_ops])
        print(f"\nAverage GPU Efficiency for Compute Operations: {avg_efficiency:.1f}%")
        if avg_efficiency >= 80:
            print(" Status: EXCELLENT - Code is well-optimized for GPU")
        elif avg_efficiency >= 50:
            print(" Status: GOOD - Some CPU overhead, but acceptable")
        else:
            # Below 50%: emit actionable advice alongside the verdict
            print(" Status: POOR - Significant CPU bottlenecks detected")
            print("\n Recommendations:")
            print(" 1. Check for implicit CPU transfers in hot paths")
            print(" 2. Minimize .item() and .cpu() calls")
            print(" 3. Avoid numpy/scipy in GPU code")
            print(" 4. Use pin_memory=True for data loading")
            print(" 5. Batch operations to reduce kernel launch overhead")
    # Check specific issues (only meaningful when a GPU was actually used)
    if has_gpu:
        print("\n" + "-" * 100)
        print("SPECIFIC ISSUES DETECTED:")
        print("-" * 100)
        # Check for CPU bottlenecks
        bottlenecks = [
            (name, metrics)
            for name, metrics in profiler.results.items()
            if metrics["device"] == "CUDA" and metrics["efficiency"] < 50
        ]
        if bottlenecks:
            print("\nCPU BOTTLENECKS (efficiency < 50%):")
            for name, metrics in bottlenecks:
                print(f" - {name}: {metrics['efficiency']:.1f}%")
                print(f" Missing GPU time: {metrics['wall_ms'] - metrics['gpu_ms']:.2f} ms")
        else:
            print("\nNo CPU bottlenecks detected in compute operations!")
        # Check for transfer overhead (entries exist only in --comprehensive runs)
        if "numpy -> GPU transfer" in profiler.results:
            transfer_ms = profiler.results["numpy -> GPU transfer"]["gpu_ms"]
            print(f"\nCPU->GPU Transfer Overhead: {transfer_ms:.2f} ms for 1 minute audio")
            print(f" Recommendation: Use streaming or chunked loading for long audio")
        if "GPU -> CPU item() x100" in profiler.results:
            item_ms = profiler.results["GPU -> CPU item() x100"]["gpu_ms"]
            print(f"\nGPU->CPU Transfer (item() x100): {item_ms:.2f} ms")
            print(f" Per-item cost: {item_ms/100:.4f} ms")
            print(f" Recommendation: Batch results and transfer once")
    print("\n" + "=" * 100)
    print("Test complete!".center(100))
    print("=" * 100 + "\n")
if __name__ == "__main__":
main()