- Implementa ProcessingMonitor singleton para procesamiento secuencial de archivos - Agrega AI summary service con soporte para MiniMax API - Agrega PDF generator para resúmenes - Agrega watchers para monitoreo de carpeta remota - Mejora sistema de notificaciones Telegram - Implementa gestión de VRAM para GPU - Configuración mediante variables de entorno (sin hardcoded secrets) - .env y transcriptions/ agregados a .gitignore Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
468 lines
17 KiB
Python
468 lines
17 KiB
Python
"""
|
|
Procesador de audio para transcripción con Whisper.
|
|
|
|
OPTIMIZACIONES DE MEMORIA PARA GPUs DE 8GB:
|
|
- Cache global singleton para evitar carga múltiple del modelo
|
|
- Configuración PYTORCH_ALLOC_CONF para reducir fragmentación
|
|
- Verificación de VRAM antes de cargar
|
|
- Fallback automático a CPU si GPU OOM
|
|
- Limpieza agresiva de cache CUDA
|
|
"""
|
|
import gc
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
import tempfile
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
|
|
from pathlib import Path
|
|
from typing import Dict, Literal, Optional, Tuple
|
|
|
|
import whisper
|
|
|
|
from config import settings
|
|
from services.vram_manager import vram_manager
|
|
|
|
|
|
logger = logging.getLogger(__name__)


# ============ OPTIMIZATION SETTINGS ============

# CRITICAL: expandable segments reduce CUDA allocator fragmentation on
# small (8 GB) GPUs. Both variable spellings are set because newer and
# older torch builds read different names; setdefault() respects values
# already exported by the environment.
os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

# Approximate model footprints in GB (weights plus runtime overhead),
# consulted before loading to decide whether a model fits in free VRAM.
MODEL_MEMORY_REQUIREMENTS = {
    "tiny": 0.5, "base": 0.8, "small": 1.5,
    "medium": 2.5, "large": 4.5,
}

# Global singleton cache mapping "model_device" -> (model, device,
# last-used timestamp). Sharing one loaded instance per model/device is
# the key safeguard against repeated loads exhausting VRAM (OOM).
_model_cache: Dict[str, Tuple[whisper.Whisper, str, float]] = {}

# Hard wall-clock cap for a single transcription, in seconds.
TRANSCRIPTION_TIMEOUT_SECONDS = 600
# Attempts per file before giving up, and the pause between attempts.
MAX_RETRY_ATTEMPTS = 2
RETRY_DELAY_SECONDS = 5
|
|
|
|
|
|
# ============ FUNCIONES DE GESTIÓN DE MEMORIA ============
|
|
|
|
def get_gpu_memory_info() -> Dict[str, float]:
    """Return GPU memory statistics in GiB.

    Keys: ``total`` (device capacity), ``free`` (total minus reserved),
    ``used`` (currently allocated) and ``reserved`` (held by the caching
    allocator). All values are zero when torch cannot be imported or no
    CUDA device is available.
    """
    gib = 1024 ** 3
    try:
        import torch
        if torch.cuda.is_available():
            total = torch.cuda.get_device_properties(0).total_memory / gib
            reserved = torch.cuda.memory_reserved(0) / gib
            allocated = torch.cuda.memory_allocated(0) / gib
            return {
                "total": total,
                "free": total - reserved,
                "used": allocated,
                "reserved": reserved,
            }
    except Exception:
        # Best effort: any probing failure is reported as "no GPU".
        pass
    return {"total": 0, "free": 0, "used": 0, "reserved": 0}
|
|
|
|
|
|
def clear_cuda_cache(aggressive: bool = False) -> None:
    """Release cached CUDA memory back to the driver.

    With ``aggressive=True`` the release is interleaved with three
    garbage-collection passes so Python-side references are dropped
    before each sweep. Silently a no-op when torch or CUDA is missing.
    """
    try:
        import torch
    except Exception:
        return
    try:
        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()
        if aggressive:
            for _ in range(3):
                gc.collect()
                torch.cuda.empty_cache()
    except Exception:
        # Cache cleanup is opportunistic; never let it raise.
        pass
|
|
|
|
|
|
def check_memory_for_model(model_name: str) -> Tuple[bool, str]:
    """Check whether the GPU has enough free memory for *model_name*.

    Args:
        model_name: Whisper model size; unknown names assume 2.0 GB.

    Returns:
        ``(fits, device)`` — ``(True, "cuda")`` when the model fits in
        free VRAM, ``(False, "cpu")`` when no GPU is visible or memory
        is insufficient.
    """
    required = MODEL_MEMORY_REQUIREMENTS.get(model_name, 2.0)
    gpu_info = get_gpu_memory_info()

    # total == 0 means torch/CUDA is unavailable entirely.
    if gpu_info["total"] == 0:
        return False, "cpu"

    # The original checked `free >= required * 1.5` and `free >= required`
    # in separate branches, but both returned the same result, so a single
    # comparison against the hard minimum is equivalent.
    if gpu_info["free"] >= required:
        return True, "cuda"

    logger.warning(f"Memoria GPU insuficiente para '{model_name}': {gpu_info['free']:.2f}GB libre, {required:.2f}GB necesario")
    return False, "cpu"
|
|
|
|
|
|
def get_cached_model(model_name: str, device: str) -> Optional[whisper.Whisper]:
    """Fetch a model from the global cache, refreshing its timestamp.

    Returns ``None`` on a cache miss or when the stored device no
    longer matches the requested one (that stale entry is evicted).
    """
    cache_key = f"{model_name}_{device}"
    entry = _model_cache.get(cache_key)
    if entry is None:
        return None

    model, cached_device, _ = entry
    if cached_device != device:
        # Stale entry whose device does not match its key: drop it.
        del _model_cache[cache_key]
        return None

    logger.info(f"Modelo '{model_name}' desde cache global")
    # Touch the entry so the timestamp reflects the most recent use.
    _model_cache[cache_key] = (model, cached_device, time.time())
    return model
|
|
|
|
|
|
def cache_model(model_name: str, model: whisper.Whisper, device: str) -> None:
    """Insert *model* into the global cache keyed by name and device."""
    key = f"{model_name}_{device}"
    _model_cache[key] = (model, device, time.time())
    logger.info(f"Modelo '{model_name}' cacheado en {device}")
|
|
|
|
|
|
def clear_model_cache() -> None:
    """Drop every cached model and aggressively free CUDA memory."""
    # Clearing the dict releases the cache's strong references to the
    # models; the aggressive CUDA sweep then reclaims the freed VRAM.
    # (The original also `del`-ed each loop-local binding, which is a
    # no-op beyond what .clear() already does.)
    _model_cache.clear()
    clear_cuda_cache(aggressive=True)
|
|
|
|
|
|
# ============ EXCEPCIONES ============
|
|
|
|
class AudioProcessingError(Exception):
    """Base error for failures while processing audio."""
|
|
|
|
|
|
class TranscriptionTimeoutError(AudioProcessingError):
    """Raised when a transcription exceeds the allowed wall-clock time."""
|
|
|
|
|
|
class GPUOutOfMemoryError(AudioProcessingError):
    """Raised on CUDA out-of-memory conditions."""
|
|
|
|
|
|
class AudioValidationError(AudioProcessingError):
    """Raised when an audio file fails pre-transcription validation."""
|
|
|
|
|
|
# ============ PROCESADOR DE AUDIO ============
|
|
|
|
class AudioProcessor:
    """Audio transcription processor built on Whisper.

    Reuses the module-level model cache to avoid loading the same model
    twice, checks free VRAM before choosing a device, and falls back to
    CPU when the GPU runs out of memory during transcription.
    """

    # Model sizes accepted for ``model_name``.
    SUPPORTED_MODELS = ("tiny", "base", "small", "medium", "large")
    DEFAULT_MODEL = settings.WHISPER_MODEL
    DEFAULT_LANGUAGE = "es"

    def __init__(
        self,
        model_name: Optional[str] = None,
        language: Optional[str] = None,
        device: Optional[Literal["cuda", "rocm", "cpu", "auto"]] = None,
    ) -> None:
        """Configure the processor; the model itself is loaded lazily.

        Args:
            model_name: Whisper model size; defaults to settings.WHISPER_MODEL.
            language: Transcription language code (default "es").
            device: Target device, or "auto" to pick based on free VRAM.

        Raises:
            ValueError: If *model_name* is not one of SUPPORTED_MODELS.
        """
        self._model_name = model_name or settings.WHISPER_MODEL
        self._language = language or self.DEFAULT_LANGUAGE
        self._device = device or "auto"
        self._model: Optional[whisper.Whisper] = None
        self._using_cpu_fallback = False
        # Identifier used to register this model with the VRAM manager.
        self._model_id = f"whisper_{self._model_name}"

        if self._model_name not in self.SUPPORTED_MODELS:
            raise ValueError(
                f"Modelo '{self._model_name}' no soportado. "
                f"Disponibles: {', '.join(self.SUPPORTED_MODELS)}"
            )

        logger.info(
            "AudioProcessor inicializado",
            extra={"model": self._model_name, "device": self._device},
        )

    @property
    def model_name(self) -> str:
        """Configured Whisper model size."""
        return self._model_name

    @property
    def language(self) -> str:
        """Language code passed to Whisper."""
        return self._language

    @property
    def device(self) -> str:
        """Resolved device once the model was loaded, else the requested one."""
        return getattr(self, "_resolved_device", self._device)

    @property
    def is_loaded(self) -> bool:
        """True while a model instance is held by this processor."""
        return self._model is not None

    def _validate_audio_file(self, audio_path: Path) -> dict:
        """Validate *audio_path* and probe its properties with ffprobe.

        Returns a dict with ``duration``, ``sample_rate``, ``channels``,
        ``codec`` and ``size_bytes``; falls back to safe defaults when
        ffprobe is unavailable or fails.

        Raises:
            AudioValidationError: If the file is smaller than 1 KiB.
        """
        logger.info(f"Validando: {audio_path.name}")

        file_size = audio_path.stat().st_size
        if file_size < 1024:
            raise AudioValidationError("Archivo demasiado pequeño")
        if file_size > 500 * 1024 * 1024:
            # Large files are allowed but logged — they may be very slow.
            logger.warning(f"Archivo grande: {file_size / 1024 / 1024:.1f}MB")

        try:
            # BUG FIX: codec_type must be requested explicitly; without it
            # the "codec_type == audio" filter below never matched and the
            # probe always fell through to the default values.
            cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                   "-show_entries", "stream=codec_type,channels,sample_rate,codec_name",
                   "-of", "json", str(audio_path)]
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

            if result.returncode == 0:
                import json
                info = json.loads(result.stdout)
                duration = float(info.get("format", {}).get("duration", 0))

                # Use the first audio stream's properties.
                for stream in info.get("streams", []):
                    if stream.get("codec_type") == "audio":
                        return {
                            "duration": duration,
                            "sample_rate": int(stream.get("sample_rate", 16000)),
                            "channels": int(stream.get("channels", 1)),
                            "codec": stream.get("codec_name", "unknown"),
                            "size_bytes": file_size,
                        }
        except Exception:
            # Probing is best-effort; fall through to defaults below.
            pass

        return {"duration": 0, "sample_rate": 16000, "channels": 1,
                "codec": "unknown", "size_bytes": file_size}

    def _convert_audio_with_ffmpeg(self, input_path: Path, output_format: str = "wav") -> Path:
        """Convert *input_path* to mono 16 kHz *output_format* via ffmpeg.

        Returns the path of a new temporary file; the caller owns it and
        is responsible for deleting it.

        Raises:
            AudioProcessingError: If ffmpeg fails or produces no output.
        """
        suffix = f".{output_format}"
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            output_path = Path(tmp.name)

        # 16 kHz mono is the format Whisper expects internally.
        cmd = ["ffmpeg", "-i", str(input_path),
               "-acodec", "pcm_s16le" if output_format == "wav" else "libmp3lame",
               "-ar", "16000", "-ac", "1", "-y", str(output_path)]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)

        if result.returncode != 0 or not output_path.exists():
            raise AudioProcessingError(f"ffmpeg falló: {result.stderr[-500:] if result.stderr else 'Unknown'}")

        return output_path

    def _get_device_with_memory_check(self) -> str:
        """Resolve the target device, verifying available GPU memory.

        Explicit "cpu"/"cuda"/"rocm" requests are honored as-is; "auto"
        picks CUDA only when the model fits in free VRAM, otherwise CPU.
        """
        if self._device == "cpu":
            return "cpu"

        if self._device == "auto":
            has_memory, recommended = check_memory_for_model(self._model_name)

            if has_memory and recommended == "cuda":
                try:
                    import torch
                    if torch.cuda.is_available():
                        logger.info(f"GPU detectada: {torch.cuda.get_device_name(0)}")
                        return "cuda"
                except ImportError:
                    pass

            if not has_memory:
                logger.warning("Usando CPU por falta de memoria GPU")
                self._using_cpu_fallback = True
            return "cpu"

        return self._device

    def _load_model(self, force_reload: bool = False) -> None:
        """Load the Whisper model, reusing the global cache when possible.

        Raises:
            AudioProcessingError: On any load failure, including CUDA OOM
                (this method deliberately does NOT fall back to CPU).
        """
        if self._model is not None and not force_reload:
            return

        # Reduce allocator fragmentation (also set at module import time).
        os.environ['PYTORCH_ALLOC_CONF'] = 'expandable_segments:True'

        clear_cuda_cache(aggressive=True)
        self._resolved_device = self._get_device_with_memory_check()

        # Reuse an already-loaded instance from the global cache.
        if not force_reload:
            cached = get_cached_model(self._model_name, self._resolved_device)
            if cached is not None:
                self._model = cached
                return

        try:
            import torch
            from contextlib import nullcontext

            # BUG FIX: torch.cuda.device() rejects "cpu", which broke the
            # CPU path; only enter the CUDA device context when actually
            # loading on a CUDA device.
            if self._resolved_device == "cuda":
                device_ctx = torch.cuda.device(self._resolved_device)
            else:
                device_ctx = nullcontext()

            with device_ctx:
                self._model = whisper.load_model(
                    self._model_name,
                    device=self._resolved_device,
                    download_root=None,
                    in_memory=True  # keep the checkpoint in RAM, not on disk
                )

            # Release load-time scratch allocations (no-op without CUDA).
            torch.cuda.empty_cache()

            cache_model(self._model_name, self._model, self._resolved_device)

            gpu_info = get_gpu_memory_info()
            logger.info(
                f"Modelo cargado en {self._resolved_device}",
                extra={"gpu_used_gb": round(gpu_info.get("used", 0), 2),
                       "gpu_free_gb": round(gpu_info.get("free", 0), 2)},
            )
            vram_manager.update_usage(self._model_id)

        except RuntimeError as e:
            error_str = str(e)
            if "out of memory" in error_str.lower():
                # Deliberately no CPU fallback here: clean the GPU so the
                # caller can retry on the same device.
                logger.error("OOM en GPU - limpiando memoria para reintentar...")
                clear_cuda_cache(aggressive=True)
                raise AudioProcessingError(f"CUDA OOM - limpie la GPU y reintente. {error_str}") from e
            else:
                raise AudioProcessingError(f"Error cargando modelo: {e}") from e
        except Exception as e:
            raise AudioProcessingError(f"Error cargando modelo: {e}") from e

    def _transcribe_internal(self, audio_path: Path, audio_properties: dict) -> str:
        """Run the actual (blocking) Whisper transcription and return text.

        NOTE(review): *audio_properties* is currently unused here; it is
        kept so the executor call in transcribe() stays signature-stable.
        """
        result = self._model.transcribe(
            str(audio_path),
            language=self._language,
            fp16=self._resolved_device in ("cuda", "rocm"),  # fp16 only on GPU
            verbose=False,
        )
        return result.get("text", "").strip()

    def transcribe(self, audio_path: str) -> str:
        """Transcribe an audio file and return its text.

        Validates the file, converts it to 16 kHz mono WAV when needed,
        runs Whisper under a timeout, and retries up to MAX_RETRY_ATTEMPTS
        times with CPU fallback on GPU OOM.

        Raises:
            FileNotFoundError: If *audio_path* does not exist.
            AudioValidationError: If the file fails validation.
            TranscriptionTimeoutError: If transcription exceeds the timeout.
            GPUOutOfMemoryError: If GPU memory stays insufficient.
            AudioProcessingError: On any other unrecoverable failure.
        """
        audio_file = Path(audio_path)
        if not audio_file.exists():
            raise FileNotFoundError(f"Archivo no encontrado: {audio_path}")

        # Mark the model as recently used so the VRAM manager keeps it.
        vram_manager.update_usage(self._model_id)

        try:
            audio_properties = self._validate_audio_file(audio_file)
        except AudioValidationError as e:
            logger.error(f"Validación falló: {e}")
            raise

        converted_file: Optional[Path] = None
        last_error: Optional[Exception] = None

        for attempt in range(MAX_RETRY_ATTEMPTS):
            try:
                # Force a fresh load on retries in case the model is corrupt.
                force_reload = attempt > 0
                if self._model is None or force_reload:
                    self._load_model(force_reload=force_reload)

                audio_to_transcribe = audio_file

                # Convert anything that is not clean mono WAV/MP3.
                needs_conversion = (
                    audio_file.suffix.lower() not in {".wav", ".mp3"} or
                    audio_properties.get("codec") in ("aac", "opus", "vorbis") or
                    audio_properties.get("channels", 1) > 1
                )

                if needs_conversion:
                    try:
                        converted_file = self._convert_audio_with_ffmpeg(audio_file, "wav")
                        audio_to_transcribe = converted_file
                    except AudioProcessingError as e:
                        # Best effort: fall back to the original file.
                        logger.warning(f"Conversión falló: {e}")

                logger.info(
                    f"Transcribiendo: {audio_file.name}",
                    extra={"device": self._resolved_device, "cpu_fallback": self._using_cpu_fallback},
                )

                # Run in a worker thread so we can enforce a hard timeout.
                with ThreadPoolExecutor(max_workers=1) as executor:
                    future = executor.submit(self._transcribe_internal, audio_to_transcribe, audio_properties)
                    try:
                        text = future.result(timeout=TRANSCRIPTION_TIMEOUT_SECONDS)
                    except FutureTimeoutError:
                        self.unload()
                        raise TranscriptionTimeoutError(f"Timeout después de {TRANSCRIPTION_TIMEOUT_SECONDS}s")

                logger.info(f"Transcripción completada: {len(text)} caracteres")
                return text

            except RuntimeError as e:
                error_str = str(e)
                last_error = e

                if "out of memory" in error_str.lower():
                    logger.warning("OOM durante transcripción...")
                    clear_cuda_cache(aggressive=True)

                    # First OOM on GPU: switch this processor to CPU and retry.
                    if not self._using_cpu_fallback and self._resolved_device in ("cuda", "rocm"):
                        self.unload()
                        self._resolved_device = "cpu"
                        self._using_cpu_fallback = True
                        self._load_model()
                        continue

                    if attempt >= MAX_RETRY_ATTEMPTS - 1:
                        raise GPUOutOfMemoryError("Memoria GPU insuficiente") from e
                    time.sleep(RETRY_DELAY_SECONDS)
                    continue

                # Known Whisper failure mode on odd inputs: retry once on a
                # freshly converted WAV.
                if "Key and Value must have the same sequence length" in error_str:
                    if not converted_file:
                        converted_file = self._convert_audio_with_ffmpeg(audio_file, "wav")
                    text = self._model.transcribe(
                        str(converted_file), language=self._language,
                        fp16=self._resolved_device in ("cuda", "rocm"), verbose=False
                    ).get("text", "").strip()
                    converted_file.unlink()
                    return text

                raise AudioProcessingError(f"Error de transcripción: {e}") from e

            except (TranscriptionTimeoutError, GPUOutOfMemoryError):
                raise
            except Exception as e:
                last_error = e
                self.unload()

                if attempt >= MAX_RETRY_ATTEMPTS - 1:
                    raise AudioProcessingError(f"Error después de {MAX_RETRY_ATTEMPTS} intentos: {e}") from e

                time.sleep(RETRY_DELAY_SECONDS)

            finally:
                # Always remove the temporary converted file, even on retry.
                if converted_file and converted_file.exists():
                    try:
                        converted_file.unlink()
                    except Exception:
                        pass

        # All attempts exhausted without returning or raising explicitly.
        raise AudioProcessingError(f"Error al transcribir: {last_error}") from last_error

    def unload(self) -> None:
        """Drop this processor's local model reference and free CUDA cache.

        The global cache may still hold the model; only the local handle
        and the VRAM-manager registration are released here.
        """
        if self._model is not None:
            self._model = None
            clear_cuda_cache(aggressive=False)
            vram_manager.unregister_model(self._model_id)

    def __repr__(self) -> str:
        return f"AudioProcessor(model='{self._model_name}', device='{self.device}', loaded={self.is_loaded})"

    def __del__(self) -> None:
        # Best-effort cleanup; never raise from a finalizer.
        try:
            self.unload()
        except Exception:
            pass
|