"""Audio file processor using Whisper"""
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, Any
|
|
from ..core import FileProcessingError
|
|
from ..config import settings
|
|
from ..services import vram_manager
|
|
from ..services.gpu_detector import gpu_detector
|
|
from .base_processor import FileProcessor
|
|
|
|
# Optional-dependency guard: whisper (and its torch backend) may be absent
# in minimal installs. AudioProcessor checks WHISPER_AVAILABLE before use
# and raises a FileProcessingError instead of crashing at import time.
try:
    import whisper
    import torch

    WHISPER_AVAILABLE = True
except ImportError:
    WHISPER_AVAILABLE = False
|
|
|
|
|
|
class AudioProcessor(FileProcessor):
    """Processor that transcribes audio files to text using OpenAI Whisper.

    The Whisper model is loaded lazily on first use, on the device reported
    by ``gpu_detector`` (CUDA when available). Transcriptions are written to
    ``settings.LOCAL_DOWNLOADS_PATH / "<stem>.txt"``.
    """

    def __init__(self):
        super().__init__("AudioProcessor")
        self.logger = logging.getLogger(__name__)
        self._model = None       # lazily-loaded Whisper model (see _load_model)
        self._device = None      # device the model was loaded on ("cuda"/"cpu")
        self._model_name = "medium"  # Optimized for Spanish

    def can_process(self, file_path: str) -> bool:
        """Return True if *file_path* has one of the configured audio extensions."""
        ext = self.get_file_extension(file_path)
        return ext in settings.AUDIO_EXTENSIONS

    def _load_model(self):
        """Load the Whisper model lazily, once per processor instance.

        Raises:
            FileProcessingError: if the ``whisper`` package is not installed.
        """
        if not WHISPER_AVAILABLE:
            raise FileProcessingError("Whisper not installed")

        if self._model is None:
            # Remember the device so process() can decide whether fp16 is safe.
            self._device = gpu_detector.get_device()
            self.logger.info(f"Loading Whisper model: {self._model_name} on {self._device}")
            self._model = whisper.load_model(self._model_name, device=self._device)
            vram_manager.update_usage()

    def process(self, file_path: str) -> Dict[str, Any]:
        """Transcribe *file_path* and save the text to the downloads directory.

        Returns:
            Dict with keys ``success``, ``transcription_path``, ``text`` and
            ``model_used``.

        Raises:
            FileProcessingError: on any failure during model loading or
                transcription (original exception chained as ``__cause__``).
        """
        self.validate_file(file_path)
        audio_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{audio_path.stem}.txt"

        self.logger.info(f"Processing audio file: {audio_path}")

        try:
            # Load model if needed, then refresh VRAM accounting.
            self._load_model()
            vram_manager.update_usage()

            # inference_mode() disables autograd bookkeeping for lower memory
            # use. fp16 is only valid on CUDA — Whisper warns and falls back
            # to fp32 on CPU, so request it conditionally.
            with torch.inference_mode():
                result = self._model.transcribe(
                    str(audio_path),
                    language="es",
                    fp16=(self._device == "cuda"),
                    verbose=False
                )

            # Save transcription
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(result["text"], encoding='utf-8')

            self.logger.info(f"Transcription completed: {output_path}")

            return {
                "success": True,
                "transcription_path": str(output_path),
                "text": result["text"],
                "model_used": self._model_name
            }

        except FileProcessingError:
            # Already a domain error (e.g. "Whisper not installed") —
            # re-raise untouched instead of double-wrapping the message.
            raise
        except Exception as e:
            self.logger.error(f"Audio processing failed: {e}")
            # Chain the cause so the original traceback is preserved.
            raise FileProcessingError(f"Audio processing failed: {e}") from e

    def cleanup(self) -> None:
        """Release the model reference and ask the VRAM manager to reclaim memory."""
        if self._model is not None:
            del self._model
            self._model = None
            vram_manager.cleanup()