#!/usr/bin/env python3
"""
GPU Performance Profiler for Twitch Highlight Detector

Measures actual GPU kernel execution time vs wall clock time to detect CPU bottlenecks.
Uses torch.cuda.Event() for precise GPU timing.

GPU efficiency < 50% indicates CPU bottlenecks (implicit transfers, numpy usage, etc.)
"""

import argparse
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F

# The test_* benchmark functions are deliberately left out of __all__: they take
# explicit (device, profiler) arguments, so a star-import into a pytest module
# must not expose them for collection as test cases.
__all__ = [
    "GPUProfiler",
    "check_tensor_device_location",
    "verify_gpu_allocation",
    "simulate_full_pipeline",
    "main",
]

# Width of the banners and rules printed by the report.
REPORT_WIDTH = 100

# Names under which the transfer benchmarks are recorded. They are shared
# between the benchmarks and the analysis in main() so the two cannot drift.
OP_NUMPY_TO_GPU = "numpy -> GPU transfer"
OP_ITEM_TRANSFER = "GPU -> CPU item() x100"
OP_GPU_TO_NUMPY = "GPU tensor -> numpy (BAD)"
TRANSFER_OPS = (OP_NUMPY_TO_GPU, OP_ITEM_TRANSFER, OP_GPU_TO_NUMPY)


class GPUProfiler:
    """Profiles GPU operations with CUDA events.

    Results are stored in ``self.results`` keyed by operation name; each entry
    holds ``gpu_ms``, ``wall_ms``, ``efficiency`` (percent) and ``device``
    (``"CUDA"`` or ``"CPU"``).
    """

    def __init__(self, device: torch.device):
        self.device = device
        self.results: Dict[str, Dict[str, float]] = {}

    def profile_operation(self, name: str, func, *args, **kwargs) -> Any:
        """
        Profile an operation and record GPU vs wall clock time.

        Args:
            name: Key under which the measurement is stored in ``self.results``
                (an existing entry with the same name is overwritten).
            func: Callable to execute; called once as ``func(*args, **kwargs)``.

        Returns:
            Result of the function call
        """
        if self.device.type == "cpu":
            # CPU fallback - just measure wall time. There is no separate GPU
            # timeline, so both timings are the same and efficiency is 100%.
            start = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed_ms = (time.perf_counter() - start) * 1000
            self.results[name] = {
                "gpu_ms": elapsed_ms,
                "wall_ms": elapsed_ms,
                "efficiency": 100.0,
                "device": "CPU",
            }
            return result

        # Create CUDA events for precise timing
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        # Drain previously queued work so it is not attributed to this call.
        torch.cuda.synchronize()

        wall_start = time.perf_counter()
        start_event.record()

        result = func(*args, **kwargs)

        end_event.record()

        # Wait for the GPU to finish before reading either clock; elapsed_time()
        # requires both events to have completed.
        torch.cuda.synchronize()

        wall_elapsed = (time.perf_counter() - wall_start) * 1000  # ms
        gpu_elapsed = start_event.elapsed_time(end_event)  # ms

        # Share of the wall-clock interval that the GPU timeline accounts for.
        efficiency = (gpu_elapsed / wall_elapsed * 100) if wall_elapsed > 0 else 0

        self.results[name] = {
            "gpu_ms": gpu_elapsed,
            "wall_ms": wall_elapsed,
            "efficiency": efficiency,
            "device": "CUDA",
        }
        return result

    @staticmethod
    def _status_label(device: str, efficiency: float) -> str:
        """Map a result's device and efficiency (percent) to a status string."""
        if device == "CPU":
            return "CPU ONLY"
        if efficiency < 50:
            return "CPU BOTTLENECK"
        if efficiency < 80:
            return "MIXED"
        return "GPU OPTIMIZED"

    def print_results(self):
        """Print profiling results in a formatted table.

        After the table, a warning is printed for every CUDA operation whose
        efficiency is below 50%.
        """
        print("\n" + "=" * REPORT_WIDTH)
        print("GPU PROFILING RESULTS".center(REPORT_WIDTH))
        print("=" * REPORT_WIDTH)
        print(f"{'Operation':<30} {'GPU Time':<15} {'Wall Time':<15} {'Efficiency':<15} {'Status':<15}")
        print("-" * REPORT_WIDTH)

        for name, metrics in self.results.items():
            gpu_time = metrics["gpu_ms"]
            wall_time = metrics["wall_ms"]
            efficiency = metrics["efficiency"]
            status = self._status_label(metrics["device"], efficiency)

            print(
                f"{name:<30} "
                f"{gpu_time:>10.2f} ms "
                f"{wall_time:>10.2f} ms "
                f"{efficiency:>10.1f}% "
                f"{status:<15}"
            )

        print("=" * REPORT_WIDTH)

        # Print warnings for bottlenecks
        for name, metrics in self.results.items():
            if metrics["efficiency"] < 50 and metrics["device"] == "CUDA":
                print(f"\nWARNING: '{name}' has GPU efficiency < 50% - likely CPU bottleneck!")
                # The stored key is "wall_ms"; reading "wall_time" here raised
                # KeyError on exactly the entries this warning is meant for.
                print(f" GPU time: {metrics['gpu_ms']:.2f} ms, Wall time: {metrics['wall_ms']:.2f} ms")
                print(f" Missing GPU time: {metrics['wall_ms'] - metrics['gpu_ms']:.2f} ms")


def test_tensor_operations(device: torch.device, profiler: GPUProfiler):
    """Test basic tensor operations on GPU.

    Profiles sqrt-of-square, mean and std over a 1,000,000-element random tensor.
    """
    print("\n[1] Testing Basic Tensor Operations")

    # Create test data (synchronously)
    if device.type == "cuda":
        torch.cuda.synchronize()
    data = torch.randn(1000000, device=device)

    def test_sqrt():
        return torch.sqrt(data ** 2)

    def test_mean():
        return torch.mean(data)

    def test_std():
        return torch.std(data)

    profiler.profile_operation("sqrt(square)", test_sqrt)
    profiler.profile_operation("mean", test_mean)
    profiler.profile_operation("std", test_std)


def test_unfold_operation(device: torch.device, profiler: GPUProfiler):
    """Test unfold (sliding window) operation - key for audio processing."""
    print("\n[2] Testing Unfold Operation (Sliding Windows)")

    # Simulate audio waveform (1 hour at 16kHz = 57.6M samples)
    # Use smaller size for testing
    samples = 16000 * 60  # 1 minute
    waveform = torch.randn(samples, device=device)

    frame_length = 16000 * 5  # 5 seconds
    hop_length = 16000  # 1 second

    def test_unfold():
        # unfold creates sliding windows
        return waveform.unfold(0, frame_length, hop_length)

    profiler.profile_operation("unfold (sliding windows)", test_unfold)


def test_window_statistics(device: torch.device, profiler: GPUProfiler):
    """Test windowed RMS calculation (main audio processing operation)."""
    print("\n[3] Testing Windowed RMS Calculation")

    # Create windowed data (as in detector)
    num_frames = 3600  # 1 hour worth of 1-second windows
    frame_length = 80000  # 5 seconds at 16kHz
    windows = torch.randn(num_frames, frame_length, device=device)

    def test_rms():
        return torch.sqrt(torch.mean(windows ** 2, dim=1))

    profiler.profile_operation("RMS (windowed)", test_rms)


def test_zscore_detection(device: torch.device, profiler: GPUProfiler):
    """Test z-score peak detection."""
    print("\n[4] Testing Z-Score Peak Detection")

    energies = torch.randn(3600, device=device)
    threshold = 1.5

    def test_zscore():
        mean_e = torch.mean(energies)
        std_e = torch.std(energies)
        # Epsilon keeps the division finite when the std is zero.
        z_scores = (energies - mean_e) / (std_e + 1e-8)
        peak_mask = z_scores > threshold
        return z_scores, peak_mask

    profiler.profile_operation("z-score + peak detection", test_zscore)


def test_conv1d_smoothing(device: torch.device, profiler: GPUProfiler):
    """Test convolution for smoothing (used in score combination)."""
    print("\n[5] Testing Conv1D Smoothing")

    duration = 3600  # 1 hour
    window = 3
    kernel_size = window * 2 + 1

    # Create sparse scores (like real highlight detection)
    tensor = torch.zeros(duration, device=device)
    indices = torch.randint(0, duration, (100,), device=device)
    tensor[indices] = torch.randn(100, device=device)

    def test_conv1d():
        # Box filter; padding=window keeps the output the same length as the input.
        kernel = torch.ones(1, 1, kernel_size, device=device) / kernel_size
        tensor_reshaped = tensor.unsqueeze(0).unsqueeze(0)
        smoothed = F.conv1d(tensor_reshaped, kernel, padding=window).squeeze()
        return smoothed

    profiler.profile_operation("conv1d smoothing", test_conv1d)


def test_cpu_transfer_overhead(device: torch.device, profiler: GPUProfiler):
    """Test CPU-GPU transfer overhead.

    Skipped (with a message) when ``device`` is a CPU device.
    """
    print("\n[6] Testing CPU-GPU Transfer Overhead")

    if device.type == "cpu":
        print(" Skipping (CPU device)")
        return

    # Create numpy array (like soundfile output)
    data_np = np.random.randn(16000 * 60).astype(np.float32)  # 1 minute audio

    def test_transfer():
        # This mimics load_audio_to_gpu
        tensor = torch.from_numpy(data_np).pin_memory().to(device, non_blocking=True)
        return tensor

    profiler.profile_operation(OP_NUMPY_TO_GPU, test_transfer)

    # Test item() transfer (GPU -> CPU)
    gpu_tensor = torch.randn(1000, device=device)

    def test_item_transfer():
        # Mimics .item() calls in detector. The per-element loop is the thing
        # being measured, so it is intentionally not vectorized.
        return [gpu_tensor[i].item() for i in range(100)]

    profiler.profile_operation(OP_ITEM_TRANSFER, test_item_transfer)


def test_numpy_fallback_detection(device: torch.device, profiler: GPUProfiler):
    """Detect if operations are falling back to CPU.

    Skipped (with a message) when ``device`` is a CPU device.
    """
    print("\n[7] Testing for Implicit CPU Transfers")

    if device.type == "cpu":
        print(" Skipping (CPU device)")
        return

    # Test: operations that might implicitly transfer to CPU
    gpu_tensor = torch.randn(10000, device=device)

    def test_numpy_conversion():
        # This should cause implicit transfer
        result = gpu_tensor.cpu().numpy()
        return result

    profiler.profile_operation(OP_GPU_TO_NUMPY, test_numpy_conversion)

    # Test: proper GPU-only path
    def test_gpu_only():
        result = torch.sqrt(gpu_tensor ** 2)
        return result

    profiler.profile_operation("GPU tensor operations (GOOD)", test_gpu_only)


def simulate_full_pipeline(device: torch.device, profiler: GPUProfiler):
    """Simulate the full audio detection pipeline.

    Runs pad -> unfold -> RMS -> mean/std -> z-score on one hour of random
    16 kHz samples and records it as a single profiled operation.
    """
    print("\n[8] Simulating Full Audio Pipeline")

    # Simulate 1 hour of audio
    sr = 16000
    duration_seconds = 3600  # 1 hour
    samples = sr * duration_seconds
    waveform = torch.randn(samples, device=device)

    window_seconds = 5
    hop_length = sr
    frame_length = sr * window_seconds

    def full_pipeline():
        # Step 1: Pad
        num_frames = 1 + (waveform.shape[-1] - frame_length) // hop_length
        padding_needed = num_frames * hop_length + frame_length - waveform.shape[-1]
        if padding_needed > 0:
            waveform_padded = F.pad(waveform, (0, padding_needed))
        else:
            waveform_padded = waveform

        # Step 2: Unfold
        windows = waveform_padded.unfold(0, frame_length, hop_length)

        # Step 3: RMS
        energies = torch.sqrt(torch.mean(windows ** 2, dim=1))

        # Step 4: Stats
        mean_e = torch.mean(energies)
        std_e = torch.std(energies)

        # Step 5: Z-score
        z_scores = (energies - mean_e) / (std_e + 1e-8)
        peak_mask = z_scores > 1.5

        return z_scores, peak_mask

    profiler.profile_operation("FULL AUDIO PIPELINE", full_pipeline)


def check_tensor_device_location(tensor: torch.Tensor, name: str):
    """Verify where a tensor is actually stored.

    Prints the tensor's device, pinned-memory flag, shape and dtype under the
    label ``name``.
    """
    device_str = str(tensor.device)
    is_pinned = tensor.is_pinned()
    print(f" {name}:")
    print(f" Device: {device_str}")
    print(f" Pin memory: {is_pinned}")
    print(f" Shape: {tensor.shape}")
    print(f" Dtype: {tensor.dtype}")


def verify_gpu_allocation():
    """Check GPU memory allocation.

    Prints name, compute capability and memory figures for CUDA device 0.

    Returns:
        True if CUDA is available, False otherwise.
    """
    if not torch.cuda.is_available():
        print("\nCUDA is not available. Running CPU-only tests.")
        return False

    print("\nGPU Information:")
    print(f" Device: {torch.cuda.get_device_name(0)}")
    print(f" Compute Capability: {torch.cuda.get_device_capability(0)}")
    print(f" Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    print(f" Current Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    print(f" Current Memory Cached: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")
    return True


def _print_banner(title: str) -> None:
    """Print ``title`` centered between two full-width rules."""
    print("=" * REPORT_WIDTH)
    print(title.center(REPORT_WIDTH))
    print("=" * REPORT_WIDTH)


def _report_average_efficiency(profiler: GPUProfiler) -> None:
    """Print the mean efficiency over all non-transfer operations."""
    compute_ops = [name for name in profiler.results if name not in TRANSFER_OPS]
    if not compute_ops:
        return

    avg_efficiency = np.mean([profiler.results[op]["efficiency"] for op in compute_ops])
    print(f"\nAverage GPU Efficiency for Compute Operations: {avg_efficiency:.1f}%")

    if avg_efficiency >= 80:
        print(" Status: EXCELLENT - Code is well-optimized for GPU")
    elif avg_efficiency >= 50:
        print(" Status: GOOD - Some CPU overhead, but acceptable")
    else:
        print(" Status: POOR - Significant CPU bottlenecks detected")
        # NOTE(review): the source's indentation was lost; the recommendations
        # are assumed to belong to the POOR branch - confirm against history.
        print("\n Recommendations:")
        print(" 1. Check for implicit CPU transfers in hot paths")
        print(" 2. Minimize .item() and .cpu() calls")
        print(" 3. Avoid numpy/scipy in GPU code")
        print(" 4. Use pin_memory=True for data loading")
        print(" 5. Batch operations to reduce kernel launch overhead")


def _report_specific_issues(profiler: GPUProfiler) -> None:
    """Print per-operation bottlenecks and the measured transfer overheads."""
    print("\n" + "-" * REPORT_WIDTH)
    print("SPECIFIC ISSUES DETECTED:")
    print("-" * REPORT_WIDTH)

    # Check for CPU bottlenecks
    bottlenecks = [
        (name, metrics)
        for name, metrics in profiler.results.items()
        if metrics["device"] == "CUDA" and metrics["efficiency"] < 50
    ]

    if bottlenecks:
        print("\nCPU BOTTLENECKS (efficiency < 50%):")
        for name, metrics in bottlenecks:
            print(f" - {name}: {metrics['efficiency']:.1f}%")
            print(f" Missing GPU time: {metrics['wall_ms'] - metrics['gpu_ms']:.2f} ms")
    else:
        print("\nNo CPU bottlenecks detected in compute operations!")

    # Check for transfer overhead
    if OP_NUMPY_TO_GPU in profiler.results:
        transfer_ms = profiler.results[OP_NUMPY_TO_GPU]["gpu_ms"]
        print(f"\nCPU->GPU Transfer Overhead: {transfer_ms:.2f} ms for 1 minute audio")
        print(" Recommendation: Use streaming or chunked loading for long audio")

    if OP_ITEM_TRANSFER in profiler.results:
        item_ms = profiler.results[OP_ITEM_TRANSFER]["gpu_ms"]
        print(f"\nGPU->CPU Transfer (item() x100): {item_ms:.2f} ms")
        print(f" Per-item cost: {item_ms/100:.4f} ms")
        print(" Recommendation: Batch results and transfer once")


def main(argv: Optional[List[str]] = None):
    """Parse command-line options, run the benchmarks and print the report.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Profile GPU usage in Twitch Highlight Detector"
    )
    parser.add_argument(
        "--device",
        choices=["auto", "cuda", "cpu"],
        default="auto",
        help="Device to use (default: auto)",
    )
    parser.add_argument(
        "--comprehensive",
        action="store_true",
        help="Run comprehensive tests including CPU transfer overhead",
    )
    args = parser.parse_args(argv)

    # Setup device
    if args.device == "auto":
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(args.device)

    # Fail early with a usage error instead of a torch error raised from
    # inside the first benchmark.
    if device.type == "cuda" and not torch.cuda.is_available():
        parser.error("--device cuda was requested but CUDA is not available")

    _print_banner("PYTORCH CUDA GPU PROFILER")
    print(f"\nTarget Device: {device}")

    has_gpu = verify_gpu_allocation()
    print(f"\nGPU Available: {has_gpu}")

    # Create profiler
    profiler = GPUProfiler(device)

    # Run tests
    print()
    _print_banner("RUNNING GPU UTILIZATION TESTS")

    test_tensor_operations(device, profiler)
    test_unfold_operation(device, profiler)
    test_window_statistics(device, profiler)
    test_zscore_detection(device, profiler)
    test_conv1d_smoothing(device, profiler)
    simulate_full_pipeline(device, profiler)

    if args.comprehensive and has_gpu:
        test_cpu_transfer_overhead(device, profiler)
        test_numpy_fallback_detection(device, profiler)

    # Print results
    profiler.print_results()

    # Analysis and recommendations
    print()
    _print_banner("ANALYSIS AND RECOMMENDATIONS")
    _report_average_efficiency(profiler)

    # Check specific issues
    if has_gpu:
        _report_specific_issues(profiler)

    print()
    print("=" * REPORT_WIDTH)
    print("Test complete!".center(REPORT_WIDTH))
    print("=" * REPORT_WIDTH + "\n")


if __name__ == "__main__":
    main()