#!/usr/bin/env python3
"""
GPU Performance Profiler for Twitch Highlight Detector

Measures actual GPU kernel execution time vs wall clock time to detect CPU bottlenecks.
Uses torch.cuda.Event() for precise GPU timing.

GPU efficiency < 50% indicates CPU bottlenecks (implicit transfers, numpy usage, etc.)
"""

import argparse
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple

import numpy as np
import torch
import torch.nn.functional as F


class GPUProfiler:
    """Time callables with CUDA events and compare GPU time to wall-clock time.

    Every profiled operation is stored in ``results`` under its name as a dict
    with the keys ``"gpu_ms"``, ``"wall_ms"``, ``"efficiency"`` (GPU time as a
    percentage of wall time) and ``"device"`` (``"CUDA"`` or ``"CPU"``).
    """

    def __init__(self, device: torch.device):
        """Create a profiler that times operations for ``device``."""
        self.device = device
        # name -> metrics; "device" holds a string, the other values are floats.
        self.results: Dict[str, Dict[str, Any]] = {}

    def profile_operation(self, name: str, func, *args, **kwargs) -> Any:
        """
        Profile an operation and record GPU vs wall clock time.

        Args:
            name: Key under which the metrics are stored in ``results``; an
                existing entry with the same name is overwritten.
            func: Callable to time.
            *args, **kwargs: Forwarded unchanged to ``func``.

        Returns:
            Result of the function call
        """
        if self.device.type == "cpu":
            # CPU fallback - there is no separate GPU clock, so wall time is
            # reported for both columns and efficiency is fixed at 100%.
            start = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed_ms = (time.perf_counter() - start) * 1000
            self.results[name] = {
                "gpu_ms": elapsed_ms,
                "wall_ms": elapsed_ms,
                "efficiency": 100.0,
                "device": "CPU",
            }
            return result

        # Create CUDA events for precise timing
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        # Synchronize first so previously queued work is not charged to func.
        torch.cuda.synchronize()

        wall_start = time.perf_counter()
        start_event.record()

        result = func(*args, **kwargs)

        end_event.record()

        # Wait for the GPU to finish before stopping the wall clock; the
        # events can only be read once they have completed.
        torch.cuda.synchronize()

        wall_elapsed = (time.perf_counter() - wall_start) * 1000  # ms
        gpu_elapsed = start_event.elapsed_time(end_event)  # ms

        # Share of the wall-clock interval that lies between the two events.
        efficiency = (gpu_elapsed / wall_elapsed * 100) if wall_elapsed > 0 else 0

        self.results[name] = {
            "gpu_ms": gpu_elapsed,
            "wall_ms": wall_elapsed,
            "efficiency": efficiency,
            "device": "CUDA",
        }

        return result

    @staticmethod
    def _status_label(device: str, efficiency: float) -> str:
        """Map a result's device and efficiency percentage to a status string."""
        if device == "CPU":
            return "CPU ONLY"
        if efficiency < 50:
            return "CPU BOTTLENECK"
        if efficiency < 80:
            return "MIXED"
        return "GPU OPTIMIZED"

    def print_results(self):
        """Print profiling results in a formatted table.

        After the table, a warning is printed for every CUDA operation whose
        efficiency is below 50%.
        """
        rule = "=" * 100
        print("\n" + rule)
        print("GPU PROFILING RESULTS".center(100))
        print(rule)
        print(f"{'Operation':<30} {'GPU Time':<15} {'Wall Time':<15} {'Efficiency':<15} {'Status':<15}")
        print("-" * 100)

        for name, metrics in self.results.items():
            status = self._status_label(metrics["device"], metrics["efficiency"])
            print(
                f"{name:<30} "
                f"{metrics['gpu_ms']:>10.2f} ms  "
                f"{metrics['wall_ms']:>10.2f} ms  "
                f"{metrics['efficiency']:>10.1f}%     "
                f"{status:<15}"
            )

        print(rule)

        # Print warnings for bottlenecks
        for name, metrics in self.results.items():
            if metrics["device"] == "CUDA" and metrics["efficiency"] < 50:
                print(f"\nWARNING: '{name}' has GPU efficiency < 50% - likely CPU bottleneck!")
                # The stored key is "wall_ms"; reading "wall_time" raised KeyError.
                print(f"  GPU time: {metrics['gpu_ms']:.2f} ms, Wall time: {metrics['wall_ms']:.2f} ms")
                print(f"  Missing GPU time: {metrics['wall_ms'] - metrics['gpu_ms']:.2f} ms")


def test_tensor_operations(device: torch.device, profiler: GPUProfiler):
    """Profile sqrt-of-square, mean and std on a 1,000,000-element random vector.

    Results are recorded on ``profiler`` as "sqrt(square)", "mean" and "std".
    """
    print("\n[1] Testing Basic Tensor Operations")

    # Wait for outstanding CUDA work before creating the test data.
    if device.type == "cuda":
        torch.cuda.synchronize()
    samples = torch.randn(1000000, device=device)

    benchmarks = (
        ("sqrt(square)", lambda: torch.sqrt(samples ** 2)),
        ("mean", lambda: torch.mean(samples)),
        ("std", lambda: torch.std(samples)),
    )
    for label, operation in benchmarks:
        profiler.profile_operation(label, operation)


def test_unfold_operation(device: torch.device, profiler: GPUProfiler):
    """Profile ``Tensor.unfold`` building sliding windows over a waveform.

    Uses one minute of random samples at 16 kHz, split into 5-second frames
    with a 1-second hop. Recorded as "unfold (sliding windows)".
    """
    print("\n[2] Testing Unfold Operation (Sliding Windows)")

    sample_rate = 16000
    # One minute keeps the test small; a full hour would be 57.6M samples.
    signal = torch.randn(sample_rate * 60, device=device)

    frame_length = sample_rate * 5
    hop_length = sample_rate

    def make_windows():
        # Slide a frame_length window along dim 0 in steps of hop_length.
        return signal.unfold(0, frame_length, hop_length)

    profiler.profile_operation("unfold (sliding windows)", make_windows)


def test_window_statistics(device: torch.device, profiler: GPUProfiler):
    """Profile a per-row RMS over a (3600, 80000) random matrix.

    Each row stands for one 5-second window at 16 kHz. Recorded as
    "RMS (windowed)".
    """
    print("\n[3] Testing Windowed RMS Calculation")

    # 3600 windows of 80000 samples each.
    shape = (3600, 80000)
    frames = torch.randn(*shape, device=device)

    def windowed_rms():
        mean_square = torch.mean(frames ** 2, dim=1)
        return torch.sqrt(mean_square)

    profiler.profile_operation("RMS (windowed)", windowed_rms)


def test_zscore_detection(device: torch.device, profiler: GPUProfiler):
    """Profile z-score normalisation plus thresholding of 3600 random energies.

    Recorded as "z-score + peak detection".
    """
    print("\n[4] Testing Z-Score Peak Detection")

    threshold = 1.5
    energies = torch.randn(3600, device=device)

    def zscore_peaks():
        mu, sigma = torch.mean(energies), torch.std(energies)
        # The small epsilon guards against division by a zero std.
        scores = (energies - mu) / (sigma + 1e-8)
        return scores, scores > threshold

    profiler.profile_operation("z-score + peak detection", zscore_peaks)


def test_conv1d_smoothing(device: torch.device, profiler: GPUProfiler):
    """Profile a moving-average ``conv1d`` over a sparse 3600-element score vector.

    Recorded as "conv1d smoothing".
    """
    print("\n[5] Testing Conv1D Smoothing")

    length = 3600
    half_width = 3
    taps = 2 * half_width + 1

    # Mostly-zero scores with random values written at up to 100 random positions.
    scores = torch.zeros(length, device=device)
    positions = torch.randint(0, length, (100,), device=device)
    scores[positions] = torch.randn(100, device=device)

    def smooth():
        # Uniform averaging kernel, built inside the timed region.
        box = torch.ones(1, 1, taps, device=device) / taps
        # conv1d expects (batch, channels, length).
        batched = scores.unsqueeze(0).unsqueeze(0)
        return F.conv1d(batched, box, padding=half_width).squeeze()

    profiler.profile_operation("conv1d smoothing", smooth)


def test_cpu_transfer_overhead(device: torch.device, profiler: GPUProfiler):
    """Profile host-to-device upload and repeated ``.item()`` reads.

    Does nothing beyond printing a skip notice when ``device`` is the CPU.
    Recorded as "numpy -> GPU transfer" and "GPU -> CPU item() x100".
    """
    print("\n[6] Testing CPU-GPU Transfer Overhead")

    if device.type == "cpu":
        print("  Skipping (CPU device)")
        return

    # One minute of float32 samples at 16 kHz held in a numpy array.
    host_audio = np.random.randn(16000 * 60).astype(np.float32)

    def upload():
        # Wrap the array, pin it, then copy to the device without blocking.
        pinned = torch.from_numpy(host_audio).pin_memory()
        return pinned.to(device, non_blocking=True)

    profiler.profile_operation("numpy -> GPU transfer", upload)

    device_values = torch.randn(1000, device=device)

    def read_items():
        # One .item() call per element, for the first 100 elements.
        return [device_values[idx].item() for idx in range(100)]

    profiler.profile_operation("GPU -> CPU item() x100", read_items)


def test_numpy_fallback_detection(device: torch.device, profiler: GPUProfiler):
    """Contrast a device-to-numpy round trip with a device-only computation.

    Does nothing beyond printing a skip notice when ``device`` is the CPU.
    Recorded as "GPU tensor -> numpy (BAD)" and "GPU tensor operations (GOOD)".
    """
    print("\n[7] Testing for Implicit CPU Transfers")

    if device.type == "cpu":
        print("  Skipping (CPU device)")
        return

    resident = torch.randn(10000, device=device)

    def to_numpy():
        # Copies the tensor to host memory before exposing it as an array.
        host_copy = resident.cpu()
        return host_copy.numpy()

    def stay_on_device():
        # Same style of math as the other tests, with no host copy.
        return torch.sqrt(resident ** 2)

    profiler.profile_operation("GPU tensor -> numpy (BAD)", to_numpy)
    profiler.profile_operation("GPU tensor operations (GOOD)", stay_on_device)


def simulate_full_pipeline(device: torch.device, profiler: GPUProfiler):
    """Profile pad -> unfold -> RMS -> z-score as a single operation.

    Runs on one hour of random samples at 16 kHz with 5-second frames and a
    1-second hop. Recorded as "FULL AUDIO PIPELINE".
    """
    print("\n[8] Simulating Full Audio Pipeline")

    sample_rate = 16000
    total_samples = sample_rate * 3600  # one hour
    waveform = torch.randn(total_samples, device=device)

    hop_length = sample_rate
    frame_length = sample_rate * 5

    def run_pipeline():
        length = waveform.shape[-1]

        # Right-pad with zeros up to num_frames * hop + frame_length samples.
        num_frames = 1 + (length - frame_length) // hop_length
        pad_amount = num_frames * hop_length + frame_length - length
        padded = F.pad(waveform, (0, pad_amount)) if pad_amount > 0 else waveform

        # Sliding windows, then the RMS energy of each window.
        frames = padded.unfold(0, frame_length, hop_length)
        energies = torch.sqrt(torch.mean(frames ** 2, dim=1))

        # Normalise the energies and flag those above 1.5 standard deviations.
        mean_e, std_e = torch.mean(energies), torch.std(energies)
        z_scores = (energies - mean_e) / (std_e + 1e-8)
        return z_scores, z_scores > 1.5

    profiler.profile_operation("FULL AUDIO PIPELINE", run_pipeline)


def check_tensor_device_location(tensor: torch.Tensor, name: str):
    """Print the device, pinned-memory flag, shape and dtype of ``tensor``.

    ``name`` is used as the heading of the printed report.
    """
    details = (
        ("Device", str(tensor.device)),
        ("Pin memory", tensor.is_pinned()),
        ("Shape", tensor.shape),
        ("Dtype", tensor.dtype),
    )

    print(f"  {name}:")
    for label, value in details:
        print(f"    {label}: {value}")


def verify_gpu_allocation():
    """Report CUDA availability and, if present, details of device 0.

    Returns:
        True when CUDA is available (device details are printed),
        False otherwise (a CPU-only notice is printed).
    """
    cuda_ready = torch.cuda.is_available()
    if not cuda_ready:
        print("\nCUDA is not available. Running CPU-only tests.")
        return False

    gpu_index = 0
    bytes_per_mb = 1024**2
    bytes_per_gb = 1024**3

    total_gb = torch.cuda.get_device_properties(gpu_index).total_memory / bytes_per_gb
    allocated_mb = torch.cuda.memory_allocated(gpu_index) / bytes_per_mb
    reserved_mb = torch.cuda.memory_reserved(gpu_index) / bytes_per_mb

    print("\nGPU Information:")
    print(f"  Device: {torch.cuda.get_device_name(gpu_index)}")
    print(f"  Compute Capability: {torch.cuda.get_device_capability(gpu_index)}")
    print(f"  Total Memory: {total_gb:.2f} GB")
    print(f"  Current Memory Allocated: {allocated_mb:.2f} MB")
    print(f"  Current Memory Cached: {reserved_mb:.2f} MB")
    return True


# Operations that measure host<->device transfers rather than GPU compute;
# they are left out of the average compute efficiency.
_TRANSFER_OPS = frozenset({
    "numpy -> GPU transfer",
    "GPU -> CPU item() x100",
    "GPU tensor -> numpy (BAD)",
})


def _print_banner(title: str, leading_blank: bool = True) -> None:
    """Print ``title`` centered between two 100-column '=' rules."""
    rule = "=" * 100
    print(("\n" if leading_blank else "") + rule)
    print(title.center(100))
    print(rule)


def _print_efficiency_summary(profiler: "GPUProfiler") -> None:
    """Print the mean efficiency of non-transfer operations and a verdict."""
    efficiencies = [
        metrics["efficiency"]
        for name, metrics in profiler.results.items()
        if name not in _TRANSFER_OPS
    ]
    if not efficiencies:
        return

    avg_efficiency = np.mean(efficiencies)
    print(f"\nAverage GPU Efficiency for Compute Operations: {avg_efficiency:.1f}%")

    if avg_efficiency >= 80:
        print("  Status: EXCELLENT - Code is well-optimized for GPU")
    elif avg_efficiency >= 50:
        print("  Status: GOOD - Some CPU overhead, but acceptable")
    else:
        print("  Status: POOR - Significant CPU bottlenecks detected")
        print("\n  Recommendations:")
        print("    1. Check for implicit CPU transfers in hot paths")
        print("    2. Minimize .item() and .cpu() calls")
        print("    3. Avoid numpy/scipy in GPU code")
        print("    4. Use pin_memory=True for data loading")
        print("    5. Batch operations to reduce kernel launch overhead")


def _print_specific_issues(profiler: "GPUProfiler") -> None:
    """Print per-operation bottlenecks and the measured transfer costs."""
    print("\n" + "-" * 100)
    print("SPECIFIC ISSUES DETECTED:")
    print("-" * 100)

    bottlenecks = [
        (name, metrics)
        for name, metrics in profiler.results.items()
        if metrics["device"] == "CUDA" and metrics["efficiency"] < 50
    ]

    if bottlenecks:
        print("\nCPU BOTTLENECKS (efficiency < 50%):")
        for name, metrics in bottlenecks:
            print(f"  - {name}: {metrics['efficiency']:.1f}%")
            print(f"    Missing GPU time: {metrics['wall_ms'] - metrics['gpu_ms']:.2f} ms")
    else:
        print("\nNo CPU bottlenecks detected in compute operations!")

    # The transfer entries exist only when the comprehensive tests ran.
    upload = profiler.results.get("numpy -> GPU transfer")
    if upload is not None:
        print(f"\nCPU->GPU Transfer Overhead: {upload['gpu_ms']:.2f} ms for 1 minute audio")
        print("  Recommendation: Use streaming or chunked loading for long audio")

    item_reads = profiler.results.get("GPU -> CPU item() x100")
    if item_reads is not None:
        item_ms = item_reads["gpu_ms"]
        print(f"\nGPU->CPU Transfer (item() x100): {item_ms:.2f} ms")
        print(f"  Per-item cost: {item_ms/100:.4f} ms")
        print("  Recommendation: Batch results and transfer once")


def main():
    """Parse CLI options, run the profiling tests and print the analysis.

    Options:
        --device {auto,cuda,cpu}: device to profile on (default: auto, which
            picks CUDA when available and the CPU otherwise).
        --comprehensive: also run the transfer-overhead tests when CUDA is
            available.

    Exits through ``parser.error`` when ``--device cuda`` is requested on a
    machine where CUDA is unavailable.
    """
    parser = argparse.ArgumentParser(
        description="Profile GPU usage in Twitch Highlight Detector"
    )
    parser.add_argument(
        "--device",
        choices=["auto", "cuda", "cpu"],
        default="auto",
        help="Device to use (default: auto)"
    )
    parser.add_argument(
        "--comprehensive",
        action="store_true",
        help="Run comprehensive tests including CPU transfer overhead"
    )
    args = parser.parse_args()

    # Setup device
    if args.device == "auto":
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        # Fail fast with a clear message instead of crashing later inside the
        # first tensor allocation.
        if args.device == "cuda" and not torch.cuda.is_available():
            parser.error("--device cuda was requested but CUDA is not available")
        device = torch.device(args.device)

    _print_banner("PYTORCH CUDA GPU PROFILER", leading_blank=False)
    print(f"\nTarget Device: {device}")

    has_gpu = verify_gpu_allocation()
    print(f"\nGPU Available: {has_gpu}")

    profiler = GPUProfiler(device)

    _print_banner("RUNNING GPU UTILIZATION TESTS")

    test_tensor_operations(device, profiler)
    test_unfold_operation(device, profiler)
    test_window_statistics(device, profiler)
    test_zscore_detection(device, profiler)
    test_conv1d_smoothing(device, profiler)
    simulate_full_pipeline(device, profiler)

    if args.comprehensive and has_gpu:
        test_cpu_transfer_overhead(device, profiler)
        test_numpy_fallback_detection(device, profiler)

    profiler.print_results()

    _print_banner("ANALYSIS AND RECOMMENDATIONS")
    _print_efficiency_summary(profiler)

    if has_gpu:
        _print_specific_issues(profiler)

    print("\n" + "=" * 100)
    print("Test complete!".center(100))
    print("=" * 100 + "\n")


# Run the profiler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()