#!/usr/bin/env python3
"""
🔥 ROCm Stress Test - Stress test for AMD GPUs.

Runs intensive GPU operations for 2 minutes and monitors metrics
(VRAM usage, per-operation timings, estimated GFLOPS).
"""
import torch
import time
import sys
from datetime import datetime


def print_header():
    """Print the startup banner with GPU name and total VRAM."""
    print("=" * 70)
    print("🔥 ROCm STRESS TEST - AMD GPU STRESS TEST")
    print("=" * 70)
    print(f"Inicio: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"VRAM Total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    else:
        # Fix: the original called get_device_properties(0) unconditionally,
        # which raises when no GPU is present (only get_device_name was guarded).
        print("GPU: N/A")
        print("VRAM Total: N/A")
    print("=" * 70)
    print()


def get_gpu_stats():
    """Return current GPU memory statistics, or None if no GPU is available.

    Returns:
        dict with 'mem_allocated', 'mem_reserved', 'mem_total' (GiB) and
        'mem_percent' (allocated as % of total), or None without CUDA/ROCm.
    """
    if not torch.cuda.is_available():
        return None
    props = torch.cuda.get_device_properties(0)
    mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
    mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
    mem_total = props.total_memory / 1024**3
    return {
        'mem_allocated': mem_allocated,
        'mem_reserved': mem_reserved,
        'mem_total': mem_total,
        'mem_percent': (mem_allocated / mem_total) * 100,
    }


def stress_test(duration_seconds=120):
    """Run the stress loop for ``duration_seconds`` seconds.

    Each iteration performs: a large fp16 matmul (size varies by iteration),
    a 3D convolution, ReLU + BatchNorm, and a large softmax. Per-operation
    timings are collected and a progress line is printed about once a second.

    Args:
        duration_seconds: wall-clock duration of the test (Ctrl+C stops early).

    Returns:
        dict of timing lists plus 'iterations' and 'errors' counters.
    """
    print(f"🧪 Iniciando stress test por {duration_seconds} segundos...")
    print(" Presiona Ctrl+C para detener en cualquier momento\n")

    if not torch.cuda.is_available():
        print("❌ ERROR: CUDA/ROCm no está disponible!")
        sys.exit(1)

    device = torch.device("cuda")
    torch.cuda.set_per_process_memory_fraction(0.85, 0)  # cap at 85% of VRAM

    results = {
        'matmul_times': [],
        'conv_times': [],
        'relu_times': [],
        'softmax_times': [],
        'iterations': 0,
        'errors': 0,
    }

    # Hoisted out of the loop: recreating these modules on every iteration
    # only churned host->device transfers of the weights.
    conv = torch.nn.Conv3d(128, 256, kernel_size=3, padding=1).to(device)
    # Fix: the original used BatchNorm2d on the 5-D Conv3d output, which
    # raises a ValueError on every iteration; 5-D activations need BatchNorm3d.
    bn = torch.nn.BatchNorm3d(256).to(device)

    start_time = time.time()
    iteration = 0
    last_print = 0

    try:
        while time.time() - start_time < duration_seconds:
            iteration += 1
            results['iterations'] = iteration
            try:
                # Intensive ML operations
                # 1. Matrix multiplication (size varies; %5 takes priority)
                if iteration % 5 == 0:
                    size = 8192  # occasionally a large matrix
                elif iteration % 3 == 0:
                    size = 4096
                else:
                    size = 2048
                a = torch.randn(size, size, device=device, dtype=torch.float16)
                b = torch.randn(size, size, device=device, dtype=torch.float16)
                torch.cuda.synchronize()
                t0 = time.time()
                c = torch.matmul(a, b)
                torch.cuda.synchronize()
                results['matmul_times'].append(time.time() - t0)
                del a, b, c

                # 2. 3D convolution
                x = torch.randn(32, 128, 64, 64, 64, device=device)
                torch.cuda.synchronize()
                t0 = time.time()
                out = conv(x)
                torch.cuda.synchronize()
                results['conv_times'].append(time.time() - t0)

                # 3. ReLU + BatchNorm
                torch.cuda.synchronize()
                t0 = time.time()
                out = bn(torch.relu(out))
                torch.cuda.synchronize()
                results['relu_times'].append(time.time() - t0)
                del x, out

                # 4. Large softmax
                x = torch.randn(2048, 2048, device=device)
                torch.cuda.synchronize()
                t0 = time.time()
                softmax_out = torch.softmax(x, dim=-1)
                torch.cuda.synchronize()
                results['softmax_times'].append(time.time() - t0)
                del softmax_out
            except Exception as e:
                results['errors'] += 1
                if results['errors'] < 5:  # only show the first few errors
                    print(f"\n⚠️ Error en iteración {iteration}: {str(e)[:50]}")

            # Progress line roughly once per second
            elapsed = time.time() - start_time
            if elapsed - last_print >= 1.0 or elapsed >= duration_seconds:
                last_print = elapsed
                progress = (elapsed / duration_seconds) * 100
                stats = get_gpu_stats()
                recent = results['matmul_times'][-10:]
                matmul_avg = sum(recent) / len(recent) if recent else 0
                print(f"\r⏱️ Iteración {iteration:4d} | "
                      f"Tiempo: {elapsed:6.1f}s/{duration_seconds}s [{progress:5.1f}%] | "
                      f"VRAM: {stats['mem_allocated']:5.2f}GB/{stats['mem_total']:.2f}GB ({stats['mem_percent']:5.1f}%) | "
                      f"MatMul avg: {matmul_avg*1000:6.2f}ms | "
                      f"Iter/s: {iteration/elapsed:5.2f}", end='', flush=True)

            # Small pause to avoid overloading the host
            time.sleep(0.05)
    except KeyboardInterrupt:
        print("\n\n⏹️ Interrumpido por el usuario")

    elapsed = time.time() - start_time
    print(f" Duración real: {elapsed:.1f} segundos")
    print(f" Iteraciones: {iteration}")
    print("\n")
    return results


def print_summary(results, duration):
    """Print the final report for a stress run.

    Args:
        results: dict as returned by stress_test().
        duration: actual wall-clock duration in seconds.
    """
    print("\n" + "=" * 70)
    print("📊 RESUMEN DEL STRESS TEST")
    print("=" * 70)
    print(f"Duración total: {duration:.2f} segundos")
    print(f"Iteraciones completadas: {results['iterations']}")
    print(f"Errores: {results['errors']}")
    print()

    if results['matmul_times']:
        times = results['matmul_times']
        last10 = times[-10:]
        print("🔢 MATRIZ MULTIPLICACIÓN (2048-8192)")
        print(f" Promedio: {sum(times)/len(times)*1000:8.2f} ms")
        print(f" Últimas 10: {sum(last10)/len(last10)*1000:8.2f} ms")
        print(f" Mínimo: {min(times)*1000:8.2f} ms")
        print(f" Máximo: {max(times)*1000:8.2f} ms")
        print()

    if results['conv_times']:
        times = results['conv_times']
        print("🧮 CONVOLUCIÓN 3D (32x128x64³)")
        print(f" Promedio: {sum(times)/len(times)*1000:8.2f} ms")
        print(f" Mínimo: {min(times)*1000:8.2f} ms")
        print(f" Máximo: {max(times)*1000:8.2f} ms")
        print()

    if results['relu_times']:
        times = results['relu_times']
        print("⚡ ReLU + BatchNorm")
        print(f" Promedio: {sum(times)/len(times)*1000:8.4f} ms")
        print(f" Mínimo: {min(times)*1000:8.4f} ms")
        print(f" Máximo: {max(times)*1000:8.4f} ms")
        print()

    if results['softmax_times']:
        times = results['softmax_times']
        print("🔥 Softmax (2048x2048)")
        print(f" Promedio: {sum(times)/len(times)*1000:8.2f} ms")
        print(f" Mínimo: {min(times)*1000:8.2f} ms")
        print(f" Máximo: {max(times)*1000:8.2f} ms")
        print()

    # Performance score
    total_ops = results['iterations']
    if total_ops > 0 and duration > 0:
        print("=" * 70)
        print("✅ TEST COMPLETADO")
        print(f" 📈 {total_ops} operaciones en {duration:.1f} segundos")
        print(f" ⚡ {total_ops/duration:.2f} operaciones/segundo")
        print(" 💾 Uso de VRAM: Hasta ~85% (configurado)")
        print("=" * 70)
        print()

        # Approximate matmul GFLOPS: 2*n^3 FLOPs for an n x n product.
        if results['matmul_times']:
            # Fix: the original size reconstruction inverted the %3/%5
            # priority used in the loop, dropped the 8192 case entirely, and
            # was off by one (loop iterations start at 1). NOTE(review):
            # iterations that errored before the matmul shift this mapping,
            # so the per-sample sizes are approximate.
            def _matrix_size(it):
                if it % 5 == 0:
                    return 8192
                if it % 3 == 0:
                    return 4096
                return 2048

            n_samples = len(results['matmul_times'])
            # Average FLOPs per sample (averaging n^3, not (avg n)^3, keeps
            # the estimate consistent across mixed sizes).
            avg_flops = sum(2 * _matrix_size(i + 1) ** 3 for i in range(n_samples)) / n_samples
            avg_time_s = sum(results['matmul_times']) / n_samples
            gflops = avg_flops / avg_time_s / 1e9
            print("🚀 RENDIMIENTO ESTIMADO")
            print(f" ~{gflops:.2f} GFLOPS (matriz multiplicación)")


def main():
    """Entry point: verify ROCm, run the 2-minute stress test, print summary."""
    print_header()

    # Verify ROCm availability before doing any work
    if not torch.cuda.is_available():
        print("❌ ERROR: ROCm/CUDA no está disponible!")
        print(" Ejecuta: export HSA_OVERRIDE_GFX_VERSION=10.3.0")
        sys.exit(1)

    # Run the stress test
    duration = 120  # 2 minutes
    start = time.time()
    results = stress_test(duration)
    actual_duration = time.time() - start

    # Show the summary
    print_summary(results, actual_duration)

    # Clean up GPU cache
    torch.cuda.empty_cache()
    print("🧹 Cache de GPU limpiado")
    print("\n✅ Stress test finalizado")


if __name__ == "__main__":
    main()