CBCFacil v8.0 - Refactored with AMD GPU support

This commit is contained in:
2026-01-09 13:05:46 -03:00
parent cb17136f21
commit b017504c52
54 changed files with 7251 additions and 3670 deletions

187
amd/ROCM_SETUP.md Normal file
View File

@@ -0,0 +1,187 @@
# 🚀 ROCm Setup para AMD GPU
## ✅ Estado Actual del Sistema
**GPU**: AMD Radeon RX 6800 XT
**ROCm**: 6.0
**PyTorch**: 2.5.0+rocm6.0
## 📋 Comandos Esenciales
### Verificar GPU
```bash
# Información básica de la GPU
lspci | grep -i vga
# Estado en tiempo real de ROCm
rocm-smi
# Información detallada del sistema
rocminfo
```
### Variables de Entorno Críticas
```bash
# CRÍTICO para gfx1030 (RX 6000 series)
export HSA_OVERRIDE_GFX_VERSION=10.3.0
# Agregar al ~/.bashrc o ~/.zshrc
echo 'export HSA_OVERRIDE_GFX_VERSION=10.3.0' >> ~/.bashrc
source ~/.bashrc
```
### Verificar PyTorch con ROCm
```bash
# Test básico de PyTorch
python3 -c "
import torch
print(f'PyTorch: {torch.__version__}')
print(f'ROCm disponible: {torch.cuda.is_available()}')
print(f'Dispositivos: {torch.cuda.device_count()}')
if torch.cuda.is_available():
print(f'GPU: {torch.cuda.get_device_name(0)}')
print(f'Memoria: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB')
"
# Benchmark rápido
python3 -c "
import torch, time
a = torch.randn(4096, 4096, device='cuda')
b = torch.randn(4096, 4096, device='cuda')
start = time.time()
c = torch.matmul(a, b)
torch.cuda.synchronize()
print(f'GPU time: {time.time() - start:.4f}s')
"
```
## 🧪 Script de Stress Test
### Ejecutar Stress Test (2 minutos)
```bash
python3 /home/ren/gpu/rocm_stress_test.py
```
## 🔧 Troubleshooting
### Si ROCm no detecta la GPU:
```bash
# Verificar módulos del kernel
lsmod | grep amdgpu
lsmod | grep kfd
# Recargar módulos
sudo modprobe amdgpu
sudo modprobe kfd
# Verificar logs
dmesg | grep amdgpu
```
### Si PyTorch no encuentra ROCm:
```bash
# Reinstalar PyTorch con ROCm
pip uninstall torch torchvision torchaudio
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0
```
### Si hay errores de memoria:
```bash
# Limpiar cache de GPU
python3 -c "import torch; torch.cuda.empty_cache()"
# Verificar uso de memoria
rocm-smi --showmeminfo vram
```
## 📊 Monitoreo Continuo
### Terminal 1 - Monitor en tiempo real
```bash
watch -n 1 rocm-smi
```
### Terminal 2 - Información detallada
```bash
rocm-smi --showtemp --showmeminfo vram --showmeminfo all
```
## 💡 Ejemplos de Uso
### Cargar modelo en GPU
```python
import torch
from transformers import AutoModel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")
model = AutoModel.from_pretrained("bert-base-uncased")
model = model.to(device)
# Los tensores ahora se procesarán en la GPU
inputs = torch.tensor([1, 2, 3]).to(device)
```
### Entrenamiento en GPU
```python
import torch
import torch.nn as nn
device = torch.device("cuda")
model = tu_modelo().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters())
for epoch in range(epochs):
for batch in dataloader:
inputs, labels = batch
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
```
## 🎯 Optimizaciones
### Para mejor rendimiento:
```python
# Usar mixed precision (más rápido en RDNA2)
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()
with autocast():
output = model(inputs)
loss = criterion(output, targets)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
```
## 📈 Comandos Útiles
```bash
# Ver versión de ROCm
rocm-smi --version
# Verificar HSA
rocminfo
# Test de compatibilidad
python3 /opt/rocm/bin/rocprofiler-compute-test.py
# Verificar el backend MPS (Apple Metal; no es una comprobación de BLAS)
python3 -c "import torch; print(torch.backends.mps.is_available())" # False en AMD
```
## ⚡ Performance Tips
1. **Siempre mueve datos a GPU**: `.to(device)`
2. **Usa batch sizes grandes**: Aprovecha los 16GB de VRAM
3. **Mixed precision**: Acelera el entrenamiento 1.5-2x
4. **DataLoader con num_workers**: Carga datos en paralelo
5. **torch.cuda.synchronize()**: Para benchmarks precisos

255
amd/rocm_stress_test.py Executable file
View File

@@ -0,0 +1,255 @@
#!/usr/bin/env python3
"""
🔥 ROCm Stress Test - Prueba de resistencia para GPU AMD
Ejecuta operaciones intensivas durante 2 minutos y monitorea métricas
"""
import torch
import time
import sys
from datetime import datetime
def print_header():
    """Print the stress-test banner: start time, GPU name and total VRAM.

    Both device fields fall back to ``N/A`` when ``torch.cuda.is_available()``
    is false, so the banner can be printed safely on a machine without a
    usable CUDA/ROCm device.
    """
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
        vram_total = f"{total_gib:.2f} GB"
    else:
        # Previously only the name was guarded; the VRAM query on device 0
        # ran unconditionally and raised when no device was present.
        gpu_name = "N/A"
        vram_total = "N/A"

    print("=" * 70)
    print("🔥 ROCm STRESS TEST - AMD GPU STRESS TEST")
    print("=" * 70)
    print(f"Inicio: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"GPU: {gpu_name}")
    print(f"VRAM Total: {vram_total}")
    print("=" * 70)
    print()
def get_gpu_stats():
    """Return a snapshot of device-0 memory usage, or ``None`` without a GPU.

    The returned dict holds ``mem_allocated``, ``mem_reserved`` and
    ``mem_total`` in GiB, plus ``mem_percent`` (allocated as a percentage of
    total).
    """
    if torch.cuda.is_available():
        gib = 1024**3
        total = torch.cuda.get_device_properties(0).total_memory / gib
        allocated = torch.cuda.memory_allocated(0) / gib
        reserved = torch.cuda.memory_reserved(0) / gib
        snapshot = {}
        snapshot['mem_allocated'] = allocated
        snapshot['mem_reserved'] = reserved
        snapshot['mem_total'] = total
        snapshot['mem_percent'] = (allocated / total) * 100
        return snapshot
    return None
def _run_stress_iteration(device, iteration, results):
    """Run one round of the four timed GPU workloads, appending timings to ``results``.

    Exceptions propagate to the caller. Every tensor is a local of this
    frame, so a failed round drops its (multi-GiB) buffers as soon as the
    caller has finished handling the exception instead of carrying them into
    the next round.
    """
    # Nothing here is back-propagated; disabling autograd avoids keeping the
    # inputs of each op alive through the graph.
    with torch.no_grad():
        # 1. Matrix multiplication (size varies with the iteration number).
        if iteration % 5 == 0:
            size = 8192  # occasional large matrix
        elif iteration % 3 == 0:
            size = 4096
        else:
            size = 2048
        a = torch.randn(size, size, device=device, dtype=torch.float16)
        b = torch.randn(size, size, device=device, dtype=torch.float16)
        torch.cuda.synchronize()
        t0 = time.perf_counter()
        c = torch.matmul(a, b)
        torch.cuda.synchronize()
        results['matmul_times'].append(time.perf_counter() - t0)
        # Kept parallel to matmul_times so FLOP counts can be derived exactly.
        results['matmul_sizes'].append(size)
        del a, b, c

        # 2. 3D convolution. The float32 input is 32*128*64^3*4 B = 4 GiB and
        # the 256-channel output (same spatial size, padding=1) is 8 GiB.
        # NOTE(review): together with step 3 this may not fit under the 85%
        # cap on a 16 GB card - confirm the intended sizing.
        x = torch.randn(32, 128, 64, 64, 64, device=device)
        conv = torch.nn.Conv3d(128, 256, kernel_size=3, padding=1).to(device)
        torch.cuda.synchronize()
        t0 = time.perf_counter()
        out = conv(x)
        torch.cuda.synchronize()
        results['conv_times'].append(time.perf_counter() - t0)
        del x, conv  # free the input before the next allocation

        # 3. ReLU + BatchNorm. The conv output is 5-D (N, C, D, H, W), so the
        # 3-D variant is required; BatchNorm2d rejects 5-D input. ReLU runs
        # in place to avoid a second output-sized buffer.
        bn = torch.nn.BatchNorm3d(256).to(device)
        torch.cuda.synchronize()
        t0 = time.perf_counter()
        out = bn(torch.relu_(out))
        torch.cuda.synchronize()
        results['reLU_times'].append(time.perf_counter() - t0)
        del out, bn

        # 4. Large softmax.
        x = torch.randn(2048, 2048, device=device)
        torch.cuda.synchronize()
        t0 = time.perf_counter()
        softmax_out = torch.softmax(x, dim=-1)
        torch.cuda.synchronize()
        results['softmax_times'].append(time.perf_counter() - t0)
        del x, softmax_out


def stress_test(duration_seconds=120):
    """Run GPU workloads in a loop for ``duration_seconds`` and collect timings.

    Each iteration times a float16 matmul, a 3D convolution, ReLU + batch
    norm, and a softmax. A progress line is printed roughly once per second.
    Ctrl+C stops the loop early and still returns what was collected.

    Args:
        duration_seconds: How long to keep iterating, in seconds.

    Returns:
        dict with per-operation timing lists in seconds (``matmul_times``,
        ``conv_times``, ``reLU_times``, ``softmax_times``), ``matmul_sizes``
        (matrix dimension for each matmul timing), ``iterations`` and
        ``errors``.

    Raises:
        SystemExit: with code 1 when no CUDA/ROCm device is available.
    """
    print(f"🧪 Iniciando stress test por {duration_seconds} segundos...")
    print(" Presiona Ctrl+C para detener en cualquier momento\n")
    if not torch.cuda.is_available():
        print("❌ ERROR: CUDA/ROCm no está disponible!")
        sys.exit(1)
    device = torch.device("cuda")
    torch.cuda.set_per_process_memory_fraction(0.85, 0)  # use 85% of VRAM

    results = {
        'matmul_times': [],
        'matmul_sizes': [],
        'conv_times': [],
        'reLU_times': [],
        'softmax_times': [],
        'iterations': 0,
        'errors': 0,
    }
    # perf_counter is monotonic, so interval measurements cannot be skewed
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    iteration = 0
    last_print = 0
    try:
        while time.perf_counter() - start_time < duration_seconds:
            iteration += 1
            results['iterations'] = iteration
            try:
                _run_stress_iteration(device, iteration, results)
            except Exception as e:
                # Best effort: count the failure and keep stressing the GPU.
                results['errors'] += 1
                if results['errors'] < 5:  # only report the first few errors
                    print(f"\n⚠️ Error en iteración {iteration}: {str(e)[:50]}")

            # Progress line, about once per second.
            elapsed = time.perf_counter() - start_time
            if elapsed - last_print >= 1.0 or elapsed >= duration_seconds:
                last_print = elapsed
                progress = (elapsed / duration_seconds) * 100
                stats = get_gpu_stats()
                recent = results['matmul_times'][-10:]
                matmul_avg = sum(recent) / len(recent) if recent else 0
                print(f"\r⏱️ Iteración {iteration:4d} | "
                      f"Tiempo: {elapsed:6.1f}s/{duration_seconds}s [{progress:5.1f}%] | "
                      f"VRAM: {stats['mem_allocated']:5.2f}GB/{stats['mem_total']:.2f}GB ({stats['mem_percent']:5.1f}%) | "
                      f"MatMul avg: {matmul_avg*1000:6.2f}ms | "
                      f"Iter/s: {iteration/elapsed:5.2f}",
                      end='', flush=True)
            # Short pause to avoid saturating the host.
            time.sleep(0.05)
    except KeyboardInterrupt:
        print("\n\n⏹️ Interrumpido por el usuario")
        elapsed = time.perf_counter() - start_time
        print(f" Duración real: {elapsed:.1f} segundos")
        print(f" Iteraciones: {iteration}")
    print("\n")
    return results
def _print_timing_section(title, times, decimals=2, show_recent=False):
    """Print average/min/max (in ms) for one list of timings; no-op if empty."""
    if not times:
        return
    average = sum(times) / len(times)
    print(title)
    print(f" Promedio: {average*1000:8.{decimals}f} ms")
    if show_recent:
        recent = times[-10:]
        recent_avg = sum(recent) / len(recent)
        print(f" Últimas 10: {recent_avg*1000:8.2f} ms")
    print(f" Mínimo: {min(times)*1000:8.{decimals}f} ms")
    print(f" Máximo: {max(times)*1000:8.{decimals}f} ms")
    print()


def _matmul_size_for_iteration(iteration):
    """Return the matrix dimension stress_test uses for a 1-based iteration."""
    if iteration % 5 == 0:
        return 8192
    if iteration % 3 == 0:
        return 4096
    return 2048


def print_summary(results, duration):
    """Print the final report for a stress-test run.

    Args:
        results: dict with ``iterations``, ``errors`` and the timing lists
            ``matmul_times``, ``conv_times``, ``reLU_times`` and
            ``softmax_times`` (seconds). An optional ``matmul_sizes`` list
            (one matrix dimension per matmul timing) is used for the GFLOPS
            estimate when present and of matching length.
        duration: Actual wall-clock duration of the run, in seconds.

    The GFLOPS estimate is total FLOPs (2*n^3 per n x n matmul) divided by
    total matmul time. Without ``matmul_sizes`` the sizes are reconstructed
    from the 1-based iteration schedule (8192 every 5th, 4096 every 3rd,
    otherwise 2048), which assumes no matmul was skipped.
    """
    print("\n" + "=" * 70)
    print("📊 RESUMEN DEL STRESS TEST")
    print("=" * 70)
    print(f"Duración total: {duration:.2f} segundos")
    print(f"Iteraciones completadas: {results['iterations']}")
    print(f"Errores: {results['errors']}")
    print()

    _print_timing_section("🔢 MATRIZ MULTIPLICACIÓN (2048-8192)",
                          results['matmul_times'], show_recent=True)
    _print_timing_section("🧮 CONVOLUCIÓN 3D (32x128x64³)", results['conv_times'])
    _print_timing_section("⚡ ReLU + BatchNorm", results['reLU_times'], decimals=4)
    _print_timing_section("🔥 Softmax (2048x2048)", results['softmax_times'])

    # Performance score
    total_ops = results['iterations']
    if total_ops > 0 and duration > 0:
        print("=" * 70)
        print("✅ TEST COMPLETADO")
        print(f" 📈 {total_ops} operaciones en {duration:.1f} segundos")
        print(f"{total_ops/duration:.2f} operaciones/segundo")
        print(" 💾 Uso de VRAM: Hasta ~85% (configurado)")
        print("=" * 70)
        print()

    # Approximate matmul throughput.
    matmul_times = results['matmul_times']
    if matmul_times:
        sizes = results.get('matmul_sizes')
        if not sizes or len(sizes) != len(matmul_times):
            sizes = [_matmul_size_for_iteration(i)
                     for i in range(1, len(matmul_times) + 1)]
        # Sum per-run FLOPs; cubing an averaged size would misweight the
        # large matrices.
        total_flops = sum(2 * n**3 for n in sizes)
        total_time = sum(matmul_times)
        gflops = total_flops / total_time / 1e9 if total_time > 0 else 0.0
        print("🚀 RENDIMIENTO ESTIMADO")
        print(f" ~{gflops:.2f} GFLOPS (matriz multiplicación)")
def main():
    """Script entry point: run the 2-minute stress test and print the report.

    Exits with status 1 (via ``SystemExit``) and a hint about
    ``HSA_OVERRIDE_GFX_VERSION`` when no ROCm/CUDA device is available.
    """
    # Check availability first: the banner queries device 0, so printing it
    # before this check made the friendly error unreachable without a GPU.
    if not torch.cuda.is_available():
        print("❌ ERROR: ROCm/CUDA no está disponible!")
        print(" Ejecuta: export HSA_OVERRIDE_GFX_VERSION=10.3.0")
        sys.exit(1)

    print_header()

    # Run the stress test and measure its actual duration.
    duration = 120  # 2 minutes
    start = time.time()
    results = stress_test(duration)
    actual_duration = time.time() - start

    print_summary(results, actual_duration)

    # Release cached GPU memory.
    torch.cuda.empty_cache()
    print("🧹 Cache de GPU limpiado")
    print("\n✅ Stress test finalizado")


if __name__ == "__main__":
    main()