CBCFacil v8.0 - Refactored with AMD GPU support

2026-01-09 13:05:46 -03:00
parent cb17136f21
commit b017504c52
54 changed files with 7251 additions and 3670 deletions
--- a/amd/ROCM_SETUP.md
+++ b/amd/ROCM_SETUP.md
@@ -0,0 +1,187 @@
+# 🚀 ROCm Setup para AMD GPU
+
+## ✅ Estado Actual del Sistema
+
+**GPU**: AMD Radeon RX 6800 XT
+**ROCm**: 6.0
+**PyTorch**: 2.5.0+rocm6.0
+
+## 📋 Comandos Esenciales
+
+### Verificar GPU
+```bash
+# Información básica de la GPU
+lspci | grep -i vga
+
+# Estado en tiempo real de ROCm
+rocm-smi
+
+# Información detallada del sistema
+rocminfo
+```
+
+### Variables de Entorno Críticas
+```bash
+# Hace que ROCm trate la GPU como gfx1030 (necesario en las RX 6000 que no son gfx1030 nativas, p. ej. gfx1031/gfx1032)
+export HSA_OVERRIDE_GFX_VERSION=10.3.0
+
+# Agregar al ~/.bashrc o ~/.zshrc
+echo 'export HSA_OVERRIDE_GFX_VERSION=10.3.0' >> ~/.bashrc
+source ~/.bashrc
+```
+
+### Verificar PyTorch con ROCm
+```bash
+# Test básico de PyTorch
+python3 -c "
+import torch
+print(f'PyTorch: {torch.__version__}')
+print(f'ROCm disponible: {torch.cuda.is_available()}')
+print(f'Dispositivos: {torch.cuda.device_count()}')
+if torch.cuda.is_available():
+    print(f'GPU: {torch.cuda.get_device_name(0)}')
+    print(f'Memoria: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB')
+"
+
+# Benchmark rápido
+python3 -c "
+import torch, time
+a = torch.randn(4096, 4096, device='cuda')
+b = torch.randn(4096, 4096, device='cuda')
+start = time.time()
+c = torch.matmul(a, b)
+torch.cuda.synchronize()
+print(f'GPU time: {time.time() - start:.4f}s')
+"
+```
+
+## 🧪 Script de Stress Test
+
+### Ejecutar Stress Test (2 minutos)
+```bash
+python3 amd/rocm_stress_test.py  # ejecutar desde la raíz del repositorio
+```
+
+## 🔧 Troubleshooting
+
+### Si ROCm no detecta la GPU:
+```bash
+# Verificar módulos del kernel
+lsmod | grep amdgpu
+lsmod | grep kfd
+
+# Recargar módulos
+sudo modprobe amdgpu
+sudo modprobe kfd
+
+# Verificar logs
+dmesg | grep amdgpu
+```
+
+### Si PyTorch no encuentra ROCm:
+```bash
+# Reinstalar PyTorch con ROCm
+pip uninstall torch torchvision torchaudio
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0
+```
+
+### Si hay errores de memoria:
+```bash
+# Limpiar cache de GPU
+python3 -c "import torch; torch.cuda.empty_cache()"
+
+# Verificar uso de memoria
+rocm-smi --showmeminfo vram
+```
+
+## 📊 Monitoreo Continuo
+
+### Terminal 1 - Monitor en tiempo real
+```bash
+watch -n 1 rocm-smi
+```
+
+### Terminal 2 - Información detallada
+```bash
+rocm-smi --showtemp --showmeminfo vram --showmeminfo all
+```
+
+## 💡 Ejemplos de Uso
+
+### Cargar modelo en GPU
+```python
+import torch
+from transformers import AutoModel
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Usando dispositivo: {device}")
+
+model = AutoModel.from_pretrained("bert-base-uncased")
+model = model.to(device)
+
+# Los tensores ahora se procesarán en la GPU
+inputs = torch.tensor([1, 2, 3]).to(device)
+```
+
+### Entrenamiento en GPU
+```python
+import torch
+import torch.nn as nn
+
+device = torch.device("cuda")
+model = tu_modelo().to(device)
+criterion = nn.CrossEntropyLoss().to(device)
+optimizer = torch.optim.Adam(model.parameters())
+
+for epoch in range(epochs):
+    for batch in dataloader:
+        inputs, labels = batch
+        inputs, labels = inputs.to(device), labels.to(device)
+
+        optimizer.zero_grad()
+        outputs = model(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+```
+
+## 🎯 Optimizaciones
+
+### Para mejor rendimiento:
+```python
+# Usar mixed precision (más rápido en RDNA2)
+from torch.cuda.amp import autocast, GradScaler
+
+scaler = GradScaler()
+with autocast():
+    output = model(inputs)
+    loss = criterion(output, targets)
+
+scaler.scale(loss).backward()
+scaler.step(optimizer)
+scaler.update()
+```
+
+## 📈 Comandos Útiles
+
+```bash
+# Ver versión de ROCm
+rocm-smi --version
+
+# Verificar HSA
+rocminfo
+
+# Test de compatibilidad
+python3 /opt/rocm/bin/rocprofiler-compute-test.py
+
+# Verificar backend MPS (exclusivo de Apple Silicon; no es una comprobación de BLAS)
+python3 -c "import torch; print(torch.backends.mps.is_available())"  # False en AMD
+```
+
+## ⚡ Performance Tips
+
+1. **Siempre mueve datos a GPU**: `.to(device)`
+2. **Usa batch sizes grandes**: Aprovecha los 16GB de VRAM
+3. **Mixed precision**: Acelera el entrenamiento 1.5-2x
+4. **DataLoader con num_workers**: Carga datos en paralelo
+5. **torch.cuda.synchronize()**: Para benchmarks precisos
--- a/amd/rocm_stress_test.py
+++ b/amd/rocm_stress_test.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+"""
+🔥 ROCm Stress Test - Prueba de resistencia para GPU AMD
+Ejecuta operaciones intensivas durante 2 minutos y monitorea métricas
+"""
+
+import torch
+import time
+import sys
+from datetime import datetime
+
+def print_header():
+    """Print the banner: start time, GPU name and total VRAM.
+
+    Safe to call when no CUDA/ROCm device is visible: both device fields
+    fall back to 'N/A' instead of raising, so a later availability check
+    can still report a friendly error.
+    """
+    if torch.cuda.is_available():
+        gpu_name = torch.cuda.get_device_name(0)
+        total_bytes = torch.cuda.get_device_properties(0).total_memory
+        vram_total = f"{total_bytes / 1024**3:.2f} GB"
+    else:
+        # Querying device properties without a device raises, so do not try.
+        gpu_name = "N/A"
+        vram_total = "N/A"
+
+    print("=" * 70)
+    print("🔥 ROCm STRESS TEST - AMD GPU STRESS TEST")
+    print("=" * 70)
+    print(f"Inicio: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"GPU: {gpu_name}")
+    print(f"VRAM Total: {vram_total}")
+    print("=" * 70)
+    print()
+
+def get_gpu_stats():
+    """Return a memory snapshot for device 0, or None when no GPU is available.
+
+    The returned dict holds allocated, reserved and total memory in GiB
+    ('mem_allocated', 'mem_reserved', 'mem_total') plus 'mem_percent',
+    the allocated share of total memory as a percentage.
+    """
+    if torch.cuda.is_available():
+        gib = 1024**3
+        total = torch.cuda.get_device_properties(0).total_memory / gib
+        allocated = torch.cuda.memory_allocated(0) / gib
+        reserved = torch.cuda.memory_reserved(0) / gib
+        return dict(
+            mem_allocated=allocated,
+            mem_reserved=reserved,
+            mem_total=total,
+            mem_percent=(allocated / total) * 100,
+        )
+
+    return None
+
+def _run_stress_iteration(device, iteration, results):
+    """Run one pass of the four GPU workloads and record their timings.
+
+    Kept as a separate function so every tensor created here is released
+    when the function returns or raises; a failing iteration therefore
+    does not keep its buffers referenced into the next one.
+
+    Args:
+        device: torch device the tensors are created on.
+        iteration: 1-based iteration counter; selects the matmul size.
+        results: dict whose '*_times' lists receive one entry (seconds)
+            for each workload that completes.
+    """
+    # Forward-only workloads: no backward pass is ever run, so skip
+    # building the autograd graph and the activations it would retain.
+    with torch.no_grad():
+        # 1. Matrix multiplication (size varies with the iteration)
+        if iteration % 5 == 0:
+            size = 8192  # occasional large matrix
+        elif iteration % 3 == 0:
+            size = 4096
+        else:
+            size = 2048
+
+        a = torch.randn(size, size, device=device, dtype=torch.float16)
+        b = torch.randn(size, size, device=device, dtype=torch.float16)
+
+        # Synchronize on both sides so only the matmul itself is timed.
+        torch.cuda.synchronize()
+        t0 = time.time()
+        c = torch.matmul(a, b)
+        torch.cuda.synchronize()
+        results['matmul_times'].append(time.time() - t0)
+
+        del a, b, c
+
+        # 2. 3D convolution
+        # NOTE(review): the fp32 input is 32*128*64**3 elements (4 GiB) and
+        # the output twice that; this may not fit under the 85% VRAM cap on
+        # smaller cards -- confirm the intended shape.
+        x = torch.randn(32, 128, 64, 64, 64, device=device)
+        conv = torch.nn.Conv3d(128, 256, kernel_size=3, padding=1).to(device)
+
+        torch.cuda.synchronize()
+        t0 = time.time()
+        out = conv(x)
+        torch.cuda.synchronize()
+        results['conv_times'].append(time.time() - t0)
+
+        # 3. ReLU + BatchNorm
+        # The Conv3d output is 5-D, so it needs BatchNorm3d; BatchNorm2d
+        # only accepts 4-D input and raised on every iteration.
+        bn = torch.nn.BatchNorm3d(256).to(device)
+        torch.cuda.synchronize()
+        t0 = time.time()
+        out = bn(torch.relu(out))
+        torch.cuda.synchronize()
+        results['reLU_times'].append(time.time() - t0)
+
+        del x, out
+
+        # 4. Large softmax
+        x = torch.randn(2048, 2048, device=device)
+        torch.cuda.synchronize()
+        t0 = time.time()
+        softmax_out = torch.softmax(x, dim=-1)
+        torch.cuda.synchronize()
+        results['softmax_times'].append(time.time() - t0)
+
+        del x, softmax_out
+
+
+def stress_test(duration_seconds=120):
+    """Run the GPU stress workloads in a loop for ``duration_seconds``.
+
+    Prints a one-line progress report roughly once per second and stops
+    early on Ctrl+C. Errors inside an iteration are counted, and the first
+    few are printed, instead of aborting the run.
+
+    Args:
+        duration_seconds: wall-clock time budget for the loop.
+
+    Returns:
+        dict with per-workload timing lists in seconds ('matmul_times',
+        'conv_times', 'reLU_times', 'softmax_times') and the 'iterations'
+        and 'errors' counters.
+
+    Exits the process with status 1 when no CUDA/ROCm device is available.
+    """
+    print(f"🧪 Iniciando stress test por {duration_seconds} segundos...")
+    print(f"   Presiona Ctrl+C para detener en cualquier momento\n")
+
+    if not torch.cuda.is_available():
+        print("❌ ERROR: CUDA/ROCm no está disponible!")
+        sys.exit(1)
+
+    device = torch.device("cuda")
+    torch.cuda.set_per_process_memory_fraction(0.85, 0)  # cap at 85% of VRAM
+
+    results = {
+        'matmul_times': [],
+        'conv_times': [],
+        'reLU_times': [],
+        'softmax_times': [],
+        'iterations': 0,
+        'errors': 0
+    }
+
+    start_time = time.time()
+    iteration = 0
+    last_print = 0
+
+    try:
+        while time.time() - start_time < duration_seconds:
+            iteration += 1
+            results['iterations'] = iteration
+
+            try:
+                _run_stress_iteration(device, iteration, results)
+            except Exception as e:
+                # Deliberately best-effort: a stress test should keep going
+                # and report how many iterations failed.
+                results['errors'] += 1
+                if results['errors'] < 5:  # only show the first few errors
+                    print(f"\n⚠️  Error en iteración {iteration}: {str(e)[:50]}")
+
+            # Progress line, about once per second
+            elapsed = time.time() - start_time
+            if elapsed - last_print >= 1.0 or elapsed >= duration_seconds:
+                last_print = elapsed
+                progress = (elapsed / duration_seconds) * 100
+
+                stats = get_gpu_stats()
+                recent = results['matmul_times'][-10:]
+                matmul_avg = sum(recent) / len(recent) if recent else 0
+
+                print(f"\r⏱️  Iteración {iteration:4d} | "
+                      f"Tiempo: {elapsed:6.1f}s/{duration_seconds}s [{progress:5.1f}%] | "
+                      f"VRAM: {stats['mem_allocated']:5.2f}GB/{stats['mem_total']:.2f}GB ({stats['mem_percent']:5.1f}%) | "
+                      f"MatMul avg: {matmul_avg*1000:6.2f}ms | "
+                      f"Iter/s: {iteration/elapsed:5.2f}",
+                      end='', flush=True)
+
+            # Short pause between iterations
+            time.sleep(0.05)
+
+    except KeyboardInterrupt:
+        print("\n\n⏹️  Interrumpido por el usuario")
+        elapsed = time.time() - start_time
+        print(f"   Duración real: {elapsed:.1f} segundos")
+        print(f"   Iteraciones: {iteration}")
+
+    print("\n")
+    return results
+
+def _matmul_size_for_iteration(iteration):
+    """Return the matmul side length used for a 1-based iteration number.
+
+    Mirrors the size schedule of the stress loop: 8192 on multiples of 5,
+    4096 on other multiples of 3, 2048 otherwise.
+    """
+    if iteration % 5 == 0:
+        return 8192
+    if iteration % 3 == 0:
+        return 4096
+    return 2048
+
+
+def _print_timing_section(title, times, fmt="8.2f", show_last10=False):
+    """Print average/min/max (in ms) for one workload; no-op if ``times`` is empty."""
+    if not times:
+        return
+    avg = sum(times) / len(times)
+    print(title)
+    print(f"  Promedio:     {avg*1000:{fmt}} ms")
+    if show_last10:
+        last10 = times[-10:]
+        print(f"  Últimas 10:   {sum(last10) / len(last10)*1000:{fmt}} ms")
+    print(f"  Mínimo:       {min(times)*1000:{fmt}} ms")
+    print(f"  Máximo:       {max(times)*1000:{fmt}} ms")
+    print()
+
+
+def print_summary(results, duration):
+    """Print the final report for a stress-test run.
+
+    Args:
+        results: dict with the timing lists in seconds ('matmul_times',
+            'conv_times', 'reLU_times', 'softmax_times') and the
+            'iterations' and 'errors' counters.
+        duration: measured wall-clock duration of the run, in seconds.
+    """
+    print("\n" + "=" * 70)
+    print("📊 RESUMEN DEL STRESS TEST")
+    print("=" * 70)
+    print(f"Duración total: {duration:.2f} segundos")
+    print(f"Iteraciones completadas: {results['iterations']}")
+    print(f"Errores: {results['errors']}")
+    print()
+
+    _print_timing_section("🔢 MATRIZ MULTIPLICACIÓN (2048-8192)",
+                          results['matmul_times'], show_last10=True)
+    _print_timing_section("🧮 CONVOLUCIÓN 3D (32x128x64³)",
+                          results['conv_times'])
+    _print_timing_section("⚡ ReLU + BatchNorm",
+                          results['reLU_times'], fmt="8.4f")
+    _print_timing_section("🔥 Softmax (2048x2048)",
+                          results['softmax_times'])
+
+    # Performance score
+    total_ops = results['iterations']
+    if total_ops > 0 and duration > 0:
+        print("=" * 70)
+        print(f"✅ TEST COMPLETADO")
+        print(f"   📈 {total_ops} operaciones en {duration:.1f} segundos")
+        print(f"   ⚡ {total_ops/duration:.2f} operaciones/segundo")
+        print(f"   💾 Uso de VRAM: Hasta ~85% (configurado)")
+        print("=" * 70)
+        print()
+
+        # Approximate matmul throughput: an n x n product costs 2*n^3 FLOPs.
+        # Sizes are not stored with the timings, so they are rebuilt from
+        # the iteration schedule; this assumes the k-th timing belongs to
+        # iteration k (true unless a matmul itself failed).
+        matmul_times = results['matmul_times']
+        total_matmul_time = sum(matmul_times)
+        if matmul_times and total_matmul_time > 0:
+            total_flops = sum(
+                2 * _matmul_size_for_iteration(i) ** 3
+                for i in range(1, len(matmul_times) + 1)
+            )
+            gflops = total_flops / total_matmul_time / 1e9
+            print(f"🚀 RENDIMIENTO ESTIMADO")
+            print(f"   ~{gflops:.2f} GFLOPS (matriz multiplicación)")
+
+def main():
+    """Check for a ROCm/CUDA device, run the 2-minute stress test and report.
+
+    Exits with status 1, after printing a hint, when no device is visible.
+    """
+    # Check availability before anything queries the device, so a missing
+    # GPU produces the hint below rather than an exception from the banner.
+    if not torch.cuda.is_available():
+        print("❌ ERROR: ROCm/CUDA no está disponible!")
+        print("   Ejecuta: export HSA_OVERRIDE_GFX_VERSION=10.3.0")
+        sys.exit(1)
+
+    print_header()
+
+    # Run the stress test and measure how long it actually took
+    duration = 120  # 2 minutes
+    start = time.time()
+    results = stress_test(duration)
+    actual_duration = time.time() - start
+
+    # Show the summary
+    print_summary(results, actual_duration)
+
+    # Release cached GPU memory
+    torch.cuda.empty_cache()
+    print("🧹 Cache de GPU limpiado")
+    print("\n✅ Stress test finalizado")
+
+# Run the stress test only when this file is executed as a script.
+if __name__ == "__main__":
+    main()