256 lines
9.3 KiB
Python
Executable File
256 lines
9.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
🔥 ROCm Stress Test - Prueba de resistencia para GPU AMD
|
|
Ejecuta operaciones intensivas durante 2 minutos y monitorea métricas
|
|
"""
|
|
|
|
import torch
|
|
import time
|
|
import sys
|
|
from datetime import datetime
|
|
|
|
def print_header():
    """Print the banner with start time, GPU name and total VRAM.

    All GPU property queries are guarded by torch.cuda.is_available() so
    the banner also works on machines without a ROCm/CUDA device — the
    original crashed on get_device_properties(0) when no GPU was present.
    """
    gpu_available = torch.cuda.is_available()
    print("=" * 70)
    print("🔥 ROCm STRESS TEST - AMD GPU STRESS TEST")
    print("=" * 70)
    print(f"Inicio: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"GPU: {torch.cuda.get_device_name(0) if gpu_available else 'N/A'}")
    if gpu_available:
        print(f"VRAM Total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    else:
        print("VRAM Total: N/A")
    print("=" * 70)
    print()
def get_gpu_stats():
    """Return a dict describing current GPU memory usage, or None when no
    CUDA/ROCm device is available.

    Keys: 'mem_allocated', 'mem_reserved', 'mem_total' (all in GiB) and
    'mem_percent' (allocated as a percentage of total).
    """
    if not torch.cuda.is_available():
        return None

    gib = 1024**3
    total_gb = torch.cuda.get_device_properties(0).total_memory / gib
    allocated_gb = torch.cuda.memory_allocated(0) / gib
    reserved_gb = torch.cuda.memory_reserved(0) / gib

    return {
        'mem_allocated': allocated_gb,
        'mem_reserved': reserved_gb,
        'mem_total': total_gb,
        'mem_percent': (allocated_gb / total_gb) * 100,
    }
def stress_test(duration_seconds=120):
    """Run an intensive GPU workload loop for ``duration_seconds`` seconds.

    Each iteration times four operations — a matmul with a varying matrix
    size, a Conv3d forward pass, ReLU+BatchNorm, and a large softmax —
    bracketing each with torch.cuda.synchronize() so the measured times
    reflect actual GPU execution. Progress is printed roughly once per
    second; the loop can be stopped early with Ctrl+C.

    Args:
        duration_seconds: target wall-clock duration of the test.

    Returns:
        dict with timing lists in seconds ('matmul_times', 'conv_times',
        'reLU_times', 'softmax_times') plus 'iterations' and 'errors'.

    Exits the process with status 1 if no CUDA/ROCm device is available.
    """
    print(f"🧪 Iniciando stress test por {duration_seconds} segundos...")
    print(f" Presiona Ctrl+C para detener en cualquier momento\n")

    if not torch.cuda.is_available():
        print("❌ ERROR: CUDA/ROCm no está disponible!")
        sys.exit(1)

    device = torch.device("cuda")
    torch.cuda.set_per_process_memory_fraction(0.85, 0)  # cap usage at 85% of VRAM

    # Timing accumulators and counters.
    results = {
        'matmul_times': [],
        'conv_times': [],
        'reLU_times': [],
        'softmax_times': [],
        'iterations': 0,
        'errors': 0
    }

    start_time = time.time()
    iteration = 0
    last_print = 0

    try:
        while time.time() - start_time < duration_seconds:
            iteration += 1
            results['iterations'] = iteration

            try:
                # Intensive ML operations.
                # 1. Matrix multiplication (size varies with the iteration).
                if iteration % 5 == 0:
                    size = 8192  # occasionally a large matrix
                elif iteration % 3 == 0:
                    size = 4096
                else:
                    size = 2048

                a = torch.randn(size, size, device=device, dtype=torch.float16)
                b = torch.randn(size, size, device=device, dtype=torch.float16)

                torch.cuda.synchronize()
                t0 = time.time()
                c = torch.matmul(a, b)
                torch.cuda.synchronize()
                matmul_time = time.time() - t0
                results['matmul_times'].append(matmul_time)

                del a, b, c

                # 2. 3D convolution.
                x = torch.randn(32, 128, 64, 64, 64, device=device)
                conv = torch.nn.Conv3d(128, 256, kernel_size=3, padding=1).to(device)

                torch.cuda.synchronize()
                t0 = time.time()
                out = conv(x)
                torch.cuda.synchronize()
                conv_time = time.time() - t0
                results['conv_times'].append(conv_time)

                # 3. ReLU + BatchNorm.
                # BUGFIX: the Conv3d output is 5-D (N, C, D, H, W), so
                # BatchNorm3d is required here. The original used
                # BatchNorm2d, which raises ValueError on non-4-D input,
                # so every iteration was silently counted as an error and
                # steps 3-4 never recorded any timings.
                bn = torch.nn.BatchNorm3d(256).to(device)
                torch.cuda.synchronize()
                t0 = time.time()
                out = bn(torch.relu(out))
                torch.cuda.synchronize()
                relu_time = time.time() - t0
                results['reLU_times'].append(relu_time)

                del x, out

                # 4. Large softmax.
                x = torch.randn(2048, 2048, device=device)
                torch.cuda.synchronize()
                t0 = time.time()
                softmax_out = torch.softmax(x, dim=-1)
                torch.cuda.synchronize()
                softmax_time = time.time() - t0
                results['softmax_times'].append(softmax_time)

                del softmax_out

            except Exception as e:
                results['errors'] += 1
                if results['errors'] < 5:  # only report the first few errors
                    print(f"\n⚠️ Error en iteración {iteration}: {str(e)[:50]}")

            # Progress report roughly once per second.
            elapsed = time.time() - start_time
            if elapsed - last_print >= 1.0 or elapsed >= duration_seconds:
                last_print = elapsed
                progress = (elapsed / duration_seconds) * 100

                # Current GPU memory stats (never None here: CUDA was
                # verified available above).
                stats = get_gpu_stats()
                matmul_avg = sum(results['matmul_times'][-10:]) / len(results['matmul_times'][-10:]) if results['matmul_times'] else 0

                print(f"\r⏱️ Iteración {iteration:4d} | "
                      f"Tiempo: {elapsed:6.1f}s/{duration_seconds}s [{progress:5.1f}%] | "
                      f"VRAM: {stats['mem_allocated']:5.2f}GB/{stats['mem_total']:.2f}GB ({stats['mem_percent']:5.1f}%) | "
                      f"MatMul avg: {matmul_avg*1000:6.2f}ms | "
                      f"Iter/s: {iteration/elapsed:5.2f}",
                      end='', flush=True)

            # Short pause to avoid overloading the host.
            time.sleep(0.05)

    except KeyboardInterrupt:
        print("\n\n⏹️ Interrumpido por el usuario")
        elapsed = time.time() - start_time
        print(f" Duración real: {elapsed:.1f} segundos")
        print(f" Iteraciones: {iteration}")

    print("\n")
    return results
def print_summary(results, duration):
    """Print a formatted summary of the stress-test results.

    Args:
        results: dict produced by stress_test() with timing lists in
            seconds ('matmul_times', 'conv_times', 'reLU_times',
            'softmax_times') plus 'iterations' and 'errors' counts.
        duration: actual wall-clock duration of the test in seconds.
    """
    print("\n" + "=" * 70)
    print("📊 RESUMEN DEL STRESS TEST")
    print("=" * 70)
    print(f"Duración total: {duration:.2f} segundos")
    print(f"Iteraciones completadas: {results['iterations']}")
    print(f"Errores: {results['errors']}")
    print()

    if results['matmul_times']:
        matmul_avg = sum(results['matmul_times']) / len(results['matmul_times'])
        matmul_min = min(results['matmul_times'])
        matmul_max = max(results['matmul_times'])
        matmul_last10_avg = sum(results['matmul_times'][-10:]) / len(results['matmul_times'][-10:])
        print(f"🔢 MATRIZ MULTIPLICACIÓN (2048-8192)")
        print(f" Promedio: {matmul_avg*1000:8.2f} ms")
        print(f" Últimas 10: {matmul_last10_avg*1000:8.2f} ms")
        print(f" Mínimo: {matmul_min*1000:8.2f} ms")
        print(f" Máximo: {matmul_max*1000:8.2f} ms")
        print()

    if results['conv_times']:
        conv_avg = sum(results['conv_times']) / len(results['conv_times'])
        conv_min = min(results['conv_times'])
        conv_max = max(results['conv_times'])
        print(f"🧮 CONVOLUCIÓN 3D (32x128x64³)")
        print(f" Promedio: {conv_avg*1000:8.2f} ms")
        print(f" Mínimo: {conv_min*1000:8.2f} ms")
        print(f" Máximo: {conv_max*1000:8.2f} ms")
        print()

    if results['reLU_times']:
        relu_avg = sum(results['reLU_times']) / len(results['reLU_times'])
        relu_min = min(results['reLU_times'])
        relu_max = max(results['reLU_times'])
        print(f"⚡ ReLU + BatchNorm")
        print(f" Promedio: {relu_avg*1000:8.4f} ms")
        print(f" Mínimo: {relu_min*1000:8.4f} ms")
        print(f" Máximo: {relu_max*1000:8.4f} ms")
        print()

    if results['softmax_times']:
        softmax_avg = sum(results['softmax_times']) / len(results['softmax_times'])
        softmax_min = min(results['softmax_times'])
        softmax_max = max(results['softmax_times'])
        print(f"🔥 Softmax (2048x2048)")
        print(f" Promedio: {softmax_avg*1000:8.2f} ms")
        print(f" Mínimo: {softmax_min*1000:8.2f} ms")
        print(f" Máximo: {softmax_max*1000:8.2f} ms")
        print()

    # Performance score.
    total_ops = results['iterations']
    if total_ops > 0 and duration > 0:
        print("=" * 70)
        print(f"✅ TEST COMPLETADO")
        print(f" 📈 {total_ops} operaciones en {duration:.1f} segundos")
        print(f" ⚡ {total_ops/duration:.2f} operaciones/segundo")
        print(f" 💾 Uso de VRAM: Hasta ~85% (configurado)")
        print("=" * 70)
        print()

    # Approximate GFLOPS for the matmul workload.
    if results['matmul_times']:
        # GFLOPS = 2 * n^3 / (time * 10^9) for an n x n matmul.
        # BUGFIX: the size schedule must mirror stress_test(), where
        # iterations are 1-based and checked %5==0 -> 8192 first, then
        # %3==0 -> 4096, else 2048. The previous expression used the
        # wrong sizes/order (no 8192 at all) and 0-based indices, and
        # cubed the averaged n instead of averaging per-matmul FLOPs,
        # skewing the estimate.
        avg_matmul_s = sum(results['matmul_times']) / len(results['matmul_times'])
        sizes = [
            8192 if it % 5 == 0 else 4096 if it % 3 == 0 else 2048
            for it in range(1, len(results['matmul_times']) + 1)
        ]
        avg_flops = sum(2 * n**3 for n in sizes) / len(sizes)
        gflops = avg_flops / avg_matmul_s / 1e9
        print(f"🚀 RENDIMIENTO ESTIMADO")
        print(f" ~{gflops:.2f} GFLOPS (matriz multiplicación)")
def main():
    """Entry point: banner, ROCm check, 2-minute stress run, summary, cleanup."""
    print_header()

    # Bail out early when no ROCm/CUDA device is visible.
    if not torch.cuda.is_available():
        print("❌ ERROR: ROCm/CUDA no está disponible!")
        print(" Ejecuta: export HSA_OVERRIDE_GFX_VERSION=10.3.0")
        sys.exit(1)

    # Run the stress test and measure its real wall-clock duration.
    test_seconds = 120  # 2 minutes
    wall_start = time.time()
    run_results = stress_test(test_seconds)
    elapsed = time.time() - wall_start

    # Report results.
    print_summary(run_results, elapsed)

    # Release cached GPU memory before exiting.
    torch.cuda.empty_cache()
    print("🧹 Cache de GPU limpiado")
    print("\n✅ Stress test finalizado")
# Run the stress test only when executed as a script (not on import).
if __name__ == "__main__":
    main()