# Commit note (translated from Spanish): hybrid detector implementation
# (Whisper + Chat + Audio + VLM); detection of real gameplay vs. talking;
# scene detection with FFmpeg; RTX 3050 and RX 6800 XT support; full guide
# in 6800xt.md for the next AI; visual-filtering and context-analysis
# scripts; automated video-generation pipeline.
#!/usr/bin/env python3
|
|
"""
|
|
GPU Performance Profiler for Twitch Highlight Detector
|
|
|
|
Measures actual GPU kernel execution time vs wall clock time to detect CPU bottlenecks.
|
|
Uses torch.cuda.Event() for precise GPU timing.
|
|
|
|
GPU efficiency < 50% indicates CPU bottlenecks (implicit transfers, numpy usage, etc.)
|
|
"""
|
|
|
|
import argparse
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple

import numpy as np
import torch
import torch.nn.functional as F
|
|
|
|
|
|
class GPUProfiler:
    """Profiles GPU operations with CUDA events.

    For each named operation this records:
      - ``gpu_ms``: GPU kernel time measured with ``torch.cuda.Event``
      - ``wall_ms``: wall clock time around the same call
      - ``efficiency``: ``gpu_ms / wall_ms * 100`` — low values mean the
        wall time is dominated by CPU work (implicit transfers, numpy, ...)
      - ``device``: ``"CUDA"`` or ``"CPU"``
    """

    def __init__(self, device: torch.device):
        self.device = device
        # Operation name -> metrics dict (keys listed in the class docstring).
        # Values mix floats and strings, so Dict[str, Any] (original said
        # Dict[str, float], which was inaccurate).
        self.results: Dict[str, Dict[str, Any]] = {}

    def profile_operation(self, name: str, func, *args, **kwargs) -> Any:
        """
        Profile an operation and record GPU vs wall clock time.

        Args:
            name: Label under which metrics are stored in ``self.results``.
            func: Callable to execute; ``*args``/``**kwargs`` are forwarded.

        Returns:
            Result of the function call
        """
        if self.device.type == "cpu":
            # CPU fallback - just measure wall time.  GPU time is defined as
            # equal to wall time so efficiency reads 100% and the reporting
            # code needs no special case.
            start = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed = time.perf_counter() - start
            self.results[name] = {
                "gpu_ms": elapsed * 1000,
                "wall_ms": elapsed * 1000,
                "efficiency": 100.0,
                "device": "CPU"
            }
            return result

        # Create CUDA events for precise timing
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        # Synchronize before starting so previously queued kernels don't
        # pollute the measurement
        torch.cuda.synchronize()

        # Start wall clock timer
        wall_start = time.perf_counter()

        # Record GPU start
        start_event.record()

        # Execute function
        result = func(*args, **kwargs)

        # Record GPU end
        end_event.record()

        # Synchronize to ensure GPU finished
        torch.cuda.synchronize()

        # End wall clock timer
        wall_elapsed = (time.perf_counter() - wall_start) * 1000  # ms

        # Get GPU elapsed time
        gpu_elapsed = start_event.elapsed_time(end_event)  # ms

        # Calculate efficiency
        efficiency = (gpu_elapsed / wall_elapsed * 100) if wall_elapsed > 0 else 0

        self.results[name] = {
            "gpu_ms": gpu_elapsed,
            "wall_ms": wall_elapsed,
            "efficiency": efficiency,
            "device": "CUDA"
        }

        return result

    def print_results(self):
        """Print profiling results in a formatted table."""
        print("\n" + "=" * 100)
        print("GPU PROFILING RESULTS".center(100))
        print("=" * 100)
        print(f"{'Operation':<30} {'GPU Time':<15} {'Wall Time':<15} {'Efficiency':<15} {'Status':<15}")
        print("-" * 100)

        for name, metrics in self.results.items():
            gpu_time = metrics["gpu_ms"]
            wall_time = metrics["wall_ms"]
            efficiency = metrics["efficiency"]
            device = metrics["device"]

            # Determine status
            if device == "CPU":
                status = "CPU ONLY"
            elif efficiency < 50:
                status = "CPU BOTTLENECK"
            elif efficiency < 80:
                status = "MIXED"
            else:
                status = "GPU OPTIMIZED"

            print(
                f"{name:<30} "
                f"{gpu_time:>10.2f} ms "
                f"{wall_time:>10.2f} ms "
                f"{efficiency:>10.1f}% "
                f"{status:<15}"
            )

        print("=" * 100)

        # Print warnings for bottlenecks
        for name, metrics in self.results.items():
            if metrics["efficiency"] < 50 and metrics["device"] == "CUDA":
                print(f"\nWARNING: '{name}' has GPU efficiency < 50% - likely CPU bottleneck!")
                # BUG FIX: original read metrics['wall_time'], which is never
                # stored (the key is 'wall_ms') -> KeyError whenever this
                # warning branch fired.
                print(f" GPU time: {metrics['gpu_ms']:.2f} ms, Wall time: {metrics['wall_ms']:.2f} ms")
                print(f" Missing GPU time: {metrics['wall_ms'] - metrics['gpu_ms']:.2f} ms")
|
|
|
|
|
|
def test_tensor_operations(device: torch.device, profiler: GPUProfiler):
    """Profile basic elementwise/reduction tensor operations.

    Args:
        device: Target device for the test tensors.
        profiler: Profiler that records GPU vs wall time per operation.
    """
    print("\n[1] Testing Basic Tensor Operations")

    # Create test data (synchronously).  IDIOM FIX: the original used a
    # conditional *expression* as a statement
    # (``torch.cuda.synchronize() if device.type == "cuda" else None``);
    # a plain if-statement is the correct form for side effects.
    if device.type == "cuda":
        torch.cuda.synchronize()
    data = torch.randn(1000000, device=device)

    def test_sqrt():
        return torch.sqrt(data ** 2)

    def test_mean():
        return torch.mean(data)

    def test_std():
        return torch.std(data)

    profiler.profile_operation("sqrt(square)", test_sqrt)
    profiler.profile_operation("mean", test_mean)
    profiler.profile_operation("std", test_std)
|
|
|
|
|
|
def test_unfold_operation(device: torch.device, profiler: GPUProfiler):
    """Profile the unfold (sliding window) operation used for audio framing."""
    print("\n[2] Testing Unfold Operation (Sliding Windows)")

    # Simulate audio waveform (1 hour at 16kHz = 57.6M samples)
    # Use smaller size for testing
    sample_rate = 16000
    num_samples = sample_rate * 60  # 1 minute
    audio = torch.randn(num_samples, device=device)

    win_len = sample_rate * 5  # 5-second frames
    hop = sample_rate          # advance one second per frame

    def run_unfold():
        # unfold yields overlapping windows along dim 0
        return audio.unfold(0, win_len, hop)

    profiler.profile_operation("unfold (sliding windows)", run_unfold)
|
|
|
|
|
|
def test_window_statistics(device: torch.device, profiler: GPUProfiler,
                           num_frames: int = 3600, frame_length: int = 80000):
    """Profile the windowed RMS calculation (main audio processing op).

    Args:
        device: Target device for the test tensors.
        profiler: Profiler that records GPU vs wall time per operation.
        num_frames: Number of analysis windows.  Default 3600 (one hour of
            1-second hops), as in the original hard-coded version.
        frame_length: Samples per window.  Default 80000 (5 s at 16 kHz).

    Note: the defaults allocate ~1.15 GB of float32 — pass smaller values
    for quick smoke tests.
    """
    print("\n[3] Testing Windowed RMS Calculation")

    # Create windowed data (as in detector)
    windows = torch.randn(num_frames, frame_length, device=device)

    def test_rms():
        return torch.sqrt(torch.mean(windows ** 2, dim=1))

    profiler.profile_operation("RMS (windowed)", test_rms)
|
|
|
|
|
|
def test_zscore_detection(device: torch.device, profiler: GPUProfiler):
    """Profile z-score normalization plus thresholded peak detection."""
    print("\n[4] Testing Z-Score Peak Detection")

    scores_in = torch.randn(3600, device=device)
    z_threshold = 1.5

    def run_zscore():
        mu = torch.mean(scores_in)
        sigma = torch.std(scores_in)
        # Small epsilon guards against division by zero for constant input
        normalized = (scores_in - mu) / (sigma + 1e-8)
        peaks = normalized > z_threshold
        return normalized, peaks

    profiler.profile_operation("z-score + peak detection", run_zscore)
|
|
|
|
|
|
def test_conv1d_smoothing(device: torch.device, profiler: GPUProfiler):
    """Profile the conv1d box-filter smoothing used in score combination."""
    print("\n[5] Testing Conv1D Smoothing")

    duration = 3600  # one hour of per-second scores
    window = 3
    kernel_size = window * 2 + 1

    # Create sparse scores (like real highlight detection)
    scores = torch.zeros(duration, device=device)
    hot_idx = torch.randint(0, duration, (100,), device=device)
    scores[hot_idx] = torch.randn(100, device=device)

    def run_smoothing():
        # Uniform kernel normalized to sum to 1 (moving average)
        box = torch.ones(1, 1, kernel_size, device=device) / kernel_size
        batched = scores.unsqueeze(0).unsqueeze(0)
        return F.conv1d(batched, box, padding=window).squeeze()

    profiler.profile_operation("conv1d smoothing", run_smoothing)
|
|
|
|
|
|
def test_cpu_transfer_overhead(device: torch.device, profiler: GPUProfiler):
    """Profile explicit host<->device transfer costs. No-op on CPU devices."""
    print("\n[6] Testing CPU-GPU Transfer Overhead")

    if device.type == "cpu":
        print(" Skipping (CPU device)")
        return

    # Create numpy array (like soundfile output)
    host_audio = np.random.randn(16000 * 60).astype(np.float32)  # 1 minute audio

    def upload():
        # This mimics load_audio_to_gpu: pinned staging + async copy
        return torch.from_numpy(host_audio).pin_memory().to(device, non_blocking=True)

    profiler.profile_operation("numpy -> GPU transfer", upload)

    # Test item() transfer (GPU -> CPU)
    device_vec = torch.randn(1000, device=device)

    def download_items():
        # Mimics .item() calls in detector: one device sync per element
        out = []
        for idx in range(100):
            out.append(device_vec[idx].item())
        return out

    profiler.profile_operation("GPU -> CPU item() x100", download_items)
|
|
|
|
|
|
def test_numpy_fallback_detection(device: torch.device, profiler: GPUProfiler):
    """Contrast an implicit GPU->CPU fallback with a pure on-device path."""
    print("\n[7] Testing for Implicit CPU Transfers")

    if device.type == "cpu":
        print(" Skipping (CPU device)")
        return

    # Test: operations that might implicitly transfer to CPU
    device_tensor = torch.randn(10000, device=device)

    def to_numpy_roundtrip():
        # .cpu() forces a device->host copy before the numpy conversion
        return device_tensor.cpu().numpy()

    profiler.profile_operation("GPU tensor -> numpy (BAD)", to_numpy_roundtrip)

    # Test: proper GPU-only path
    def stay_on_device():
        return torch.sqrt(device_tensor ** 2)

    profiler.profile_operation("GPU tensor operations (GOOD)", stay_on_device)
|
|
|
|
|
|
def simulate_full_pipeline(device: torch.device, profiler: GPUProfiler,
                           duration_seconds: int = 3600):
    """Simulate the full audio detection pipeline (pad -> unfold -> RMS ->
    z-score -> peak mask).

    Args:
        device: Target device for the simulated waveform.
        profiler: Profiler that records GPU vs wall time per operation.
        duration_seconds: Length of the simulated audio in seconds.
            Default 3600 (one hour), matching the original hard-coded value.
            Must be at least the 5-second window for unfold to succeed.
    """
    print("\n[8] Simulating Full Audio Pipeline")

    # Simulate audio at 16 kHz
    sr = 16000
    samples = sr * duration_seconds
    waveform = torch.randn(samples, device=device)

    window_seconds = 5
    hop_length = sr
    frame_length = sr * window_seconds

    def full_pipeline():
        # Step 1: Pad so the final hop still has a full frame available
        num_frames = 1 + (waveform.shape[-1] - frame_length) // hop_length
        padding_needed = num_frames * hop_length + frame_length - waveform.shape[-1]
        if padding_needed > 0:
            waveform_padded = F.pad(waveform, (0, padding_needed))
        else:
            waveform_padded = waveform

        # Step 2: Unfold into overlapping 5 s windows (a view, no copy)
        windows = waveform_padded.unfold(0, frame_length, hop_length)

        # Step 3: RMS energy per window
        energies = torch.sqrt(torch.mean(windows ** 2, dim=1))

        # Step 4: Global stats over window energies
        mean_e = torch.mean(energies)
        std_e = torch.std(energies)

        # Step 5: Z-score + fixed 1.5-sigma peak threshold
        z_scores = (energies - mean_e) / (std_e + 1e-8)
        peak_mask = z_scores > 1.5

        return z_scores, peak_mask

    profiler.profile_operation("FULL AUDIO PIPELINE", full_pipeline)
|
|
|
|
|
|
def check_tensor_device_location(tensor: torch.Tensor, name: str):
    """Report where a tensor is actually stored (device, pinning, shape, dtype)."""
    pinned = tensor.is_pinned()
    location = str(tensor.device)

    print(f" {name}:")
    print(f" Device: {location}")
    print(f" Pin memory: {pinned}")
    print(f" Shape: {tensor.shape}")
    print(f" Dtype: {tensor.dtype}")
|
|
|
|
|
|
def verify_gpu_allocation():
    """Print GPU device/memory info; return True iff CUDA is available."""
    if not torch.cuda.is_available():
        print("\nCUDA is not available. Running CPU-only tests.")
        return False

    # Hoist the properties lookup instead of repeating per-field calls
    props = torch.cuda.get_device_properties(0)
    print(f"\nGPU Information:")
    print(f" Device: {torch.cuda.get_device_name(0)}")
    print(f" Compute Capability: {torch.cuda.get_device_capability(0)}")
    print(f" Total Memory: {props.total_memory / 1024**3:.2f} GB")
    print(f" Current Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    print(f" Current Memory Cached: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")
    return True
|
|
|
|
|
|
def main():
    """CLI entry point: run every profiling scenario and print a report.

    Flags:
        --device {auto,cuda,cpu}: device selection; ``auto`` picks CUDA
            when available, else CPU.
        --comprehensive: additionally measure CPU<->GPU transfer overhead
            (only meaningful when a GPU is present).
    """
    parser = argparse.ArgumentParser(
        description="Profile GPU usage in Twitch Highlight Detector"
    )
    parser.add_argument(
        "--device",
        choices=["auto", "cuda", "cpu"],
        default="auto",
        help="Device to use (default: auto)"
    )
    parser.add_argument(
        "--comprehensive",
        action="store_true",
        help="Run comprehensive tests including CPU transfer overhead"
    )
    args = parser.parse_args()

    # Setup device
    if args.device == "auto":
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        # NOTE(review): an explicit --device cuda without CUDA installed is
        # not validated here; later allocations would fail.
        device = torch.device(args.device)

    print("=" * 100)
    print("PYTORCH CUDA GPU PROFILER".center(100))
    print("=" * 100)
    print(f"\nTarget Device: {device}")

    # Also prints device/memory details as a side effect.
    has_gpu = verify_gpu_allocation()
    print(f"\nGPU Available: {has_gpu}")

    # Create profiler
    profiler = GPUProfiler(device)

    # Run tests
    print("\n" + "=" * 100)
    print("RUNNING GPU UTILIZATION TESTS".center(100))
    print("=" * 100)

    test_tensor_operations(device, profiler)
    test_unfold_operation(device, profiler)
    test_window_statistics(device, profiler)
    test_zscore_detection(device, profiler)
    test_conv1d_smoothing(device, profiler)
    simulate_full_pipeline(device, profiler)

    # Transfer benchmarks are opt-in and GPU-only.
    if args.comprehensive and has_gpu:
        test_cpu_transfer_overhead(device, profiler)
        test_numpy_fallback_detection(device, profiler)

    # Print results
    profiler.print_results()

    # Analysis and recommendations
    print("\n" + "=" * 100)
    print("ANALYSIS AND RECOMMENDATIONS".center(100))
    print("=" * 100)

    # Exclude the transfer benchmarks from the compute-efficiency average —
    # they are expected to be CPU-bound and would skew the number.
    compute_ops = [name for name in profiler.results
                   if name not in ["numpy -> GPU transfer", "GPU -> CPU item() x100",
                                   "GPU tensor -> numpy (BAD)"]]

    if compute_ops:
        avg_efficiency = np.mean([profiler.results[op]["efficiency"] for op in compute_ops])
        print(f"\nAverage GPU Efficiency for Compute Operations: {avg_efficiency:.1f}%")

        if avg_efficiency >= 80:
            print(" Status: EXCELLENT - Code is well-optimized for GPU")
        elif avg_efficiency >= 50:
            print(" Status: GOOD - Some CPU overhead, but acceptable")
        else:
            print(" Status: POOR - Significant CPU bottlenecks detected")
            print("\n Recommendations:")
            print(" 1. Check for implicit CPU transfers in hot paths")
            print(" 2. Minimize .item() and .cpu() calls")
            print(" 3. Avoid numpy/scipy in GPU code")
            print(" 4. Use pin_memory=True for data loading")
            print(" 5. Batch operations to reduce kernel launch overhead")

    # Check specific issues
    if has_gpu:
        print("\n" + "-" * 100)
        print("SPECIFIC ISSUES DETECTED:")
        print("-" * 100)

        # Check for CPU bottlenecks (CUDA ops whose kernels cover < 50% of
        # their wall time).
        bottlenecks = [
            (name, metrics)
            for name, metrics in profiler.results.items()
            if metrics["device"] == "CUDA" and metrics["efficiency"] < 50
        ]

        if bottlenecks:
            print("\nCPU BOTTLENECKS (efficiency < 50%):")
            for name, metrics in bottlenecks:
                print(f" - {name}: {metrics['efficiency']:.1f}%")
                print(f" Missing GPU time: {metrics['wall_ms'] - metrics['gpu_ms']:.2f} ms")
        else:
            print("\nNo CPU bottlenecks detected in compute operations!")

        # Check for transfer overhead (present only with --comprehensive)
        if "numpy -> GPU transfer" in profiler.results:
            transfer_ms = profiler.results["numpy -> GPU transfer"]["gpu_ms"]
            print(f"\nCPU->GPU Transfer Overhead: {transfer_ms:.2f} ms for 1 minute audio")
            print(f" Recommendation: Use streaming or chunked loading for long audio")

        if "GPU -> CPU item() x100" in profiler.results:
            item_ms = profiler.results["GPU -> CPU item() x100"]["gpu_ms"]
            print(f"\nGPU->CPU Transfer (item() x100): {item_ms:.2f} ms")
            print(f" Per-item cost: {item_ms/100:.4f} ms")
            print(f" Recommendation: Batch results and transfer once")

    print("\n" + "=" * 100)
    print("Test complete!".center(100))
    print("=" * 100 + "\n")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the profiler CLI only when executed as a script, not on import.
    main()
|