Files
ableton-mcp-ai/AbletonMCP_AI/MCP_Server/audio_analyzer.py
renato97 6ec8663954 Initial commit: AbletonMCP-AI complete system
- MCP Server with audio fallback, sample management
- Song generator with bus routing
- Reference listener and audio resampler
- Vector-based sample search
- Master chain with limiter and calibration
- Fix: Audio fallback now works without M4L
- Fix: Full song detection in sample loader

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-28 22:53:10 -03:00

682 lines
22 KiB
Python

"""
audio_analyzer.py - Audio analysis for key and BPM detection

Provides basic analysis of audio files to extract:
- BPM (tempo) via onset detection and autocorrelation
- Key (tonality) via chromagram analysis
- Spectral features for classification
"""
import os
import logging
import numpy as np
import subprocess
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List
from dataclasses import dataclass
from enum import Enum
logger = logging.getLogger("AudioAnalyzer")

# Musical constants: the 12 pitch classes in sharp notation, used to map
# chroma bin indices to note names.
NOTE_NAMES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
KEY_PROFILES = {
    # Krumhansl-Schmuckler key profiles for key detection: perceptual weight
    # of each of the 12 pitch classes relative to the tonic.
    'major': [6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88],
    'minor': [6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17]
}
# Keys ordered around the circle of fifths; adjacent entries are musically
# compatible. Used for key-compatibility scoring and key transposition.
CIRCLE_OF_FIFTHS_MAJOR = ['C', 'G', 'D', 'A', 'E', 'B', 'F#', 'C#', 'G#', 'D#', 'A#', 'F']
CIRCLE_OF_FIFTHS_MINOR = ['Am', 'Em', 'Bm', 'F#m', 'C#m', 'G#m', 'D#m', 'A#m', 'Fm', 'Cm', 'Gm', 'Dm']
class SampleType(Enum):
    """Categories of musical samples recognized by the analyzer."""
    # Percussive one-shots
    KICK = "kick"
    SNARE = "snare"
    CLAP = "clap"
    HAT_CLOSED = "hat_closed"
    HAT_OPEN = "hat_open"
    HAT = "hat"
    PERC = "perc"
    SHAKER = "shaker"
    TOM = "tom"
    CRASH = "crash"
    RIDE = "ride"
    # Tonal / harmonic material
    BASS = "bass"
    SYNTH = "synth"
    PAD = "pad"
    LEAD = "lead"
    PLUCK = "pluck"
    ARP = "arp"
    CHORD = "chord"
    STAB = "stab"
    # Other content
    VOCAL = "vocal"
    FX = "fx"
    LOOP = "loop"
    AMBIENCE = "ambience"
    UNKNOWN = "unknown"
@dataclass
class AudioFeatures:
    """Features extracted from an audio file."""
    bpm: Optional[float]           # estimated tempo in BPM, or None if not detected
    key: Optional[str]             # musical key, e.g. 'C#' or 'Am', or None if not detected
    key_confidence: float          # confidence of the key estimate in [0, 1]
    duration: float                # length in seconds
    sample_rate: int               # sampling rate in Hz
    sample_type: SampleType        # classified sample category
    spectral_centroid: float       # mean spectral centroid ("brightness"), Hz
    spectral_rolloff: float        # mean spectral rolloff frequency, Hz
    zero_crossing_rate: float      # mean zero-crossing rate
    rms_energy: float              # mean RMS energy
    is_harmonic: bool              # True when the content is mostly tonal
    is_percussive: bool            # True when the content is mostly percussive
    suggested_genres: List[str]    # genre tags suggested for this sample
class AudioAnalyzer:
"""
Analizador de audio para samples musicales.
Soporta múltiples backends:
- librosa (recomendado, más preciso)
- basic (fallback sin dependencias externas, basado en nombre de archivo)
"""
def __init__(self, backend: str = "auto"):
"""
Inicializa el analizador de audio.
Args:
backend: 'librosa', 'basic', o 'auto' (detecta automáticamente)
"""
self.backend = backend
self._librosa_available = False
self._soundfile_available = False
if backend in ("auto", "librosa"):
self._check_librosa()
if self._librosa_available:
logger.info("Usando backend: librosa")
else:
logger.info("Usando backend: basic (análisis por nombre de archivo)")
def _check_librosa(self):
"""Verifica si librosa está disponible"""
try:
import librosa
import soundfile as sf
self._librosa_available = True
self._soundfile_available = True
self.librosa = librosa
self.sf = sf
except ImportError:
self._librosa_available = False
self._soundfile_available = False
def analyze(self, file_path: str) -> AudioFeatures:
"""
Analiza un archivo de audio y extrae características.
Args:
file_path: Ruta al archivo de audio
Returns:
AudioFeatures con los datos extraídos
"""
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"Archivo no encontrado: {file_path}")
# Intentar análisis con librosa si está disponible
if self._librosa_available:
try:
return self._analyze_with_librosa(file_path)
except Exception as e:
logger.warning(f"Error con librosa: {e}, usando análisis básico")
# Fallback a análisis básico
return self._analyze_basic(file_path)
def _analyze_with_librosa(self, file_path: str) -> AudioFeatures:
"""Análisis completo usando librosa"""
# Cargar audio
y, sr = self.librosa.load(file_path, sr=None, mono=True)
# Duración
duration = self.librosa.get_duration(y=y, sr=sr)
# Detectar BPM
tempo, _ = self.librosa.beat.beat_track(y=y, sr=sr)
bpm = float(tempo) if isinstance(tempo, (int, float, np.number)) else None
# Análisis espectral
spectral_centroids = self.librosa.feature.spectral_centroid(y=y, sr=sr)[0]
spectral_rolloffs = self.librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
zcr = self.librosa.feature.zero_crossing_rate(y)[0]
rms = self.librosa.feature.rms(y=y)[0]
# Detectar key
key, key_confidence = self._detect_key_librosa(y, sr)
# Clasificación percusivo vs armónico
is_percussive = self._is_percussive(y, sr)
is_harmonic = not is_percussive and duration > 1.0
# Determinar tipo de sample
sample_type = self._classify_sample_type(
file_path, is_percussive, is_harmonic, duration,
float(np.mean(spectral_centroids)), float(np.mean(rms))
)
# Sugerir géneros
suggested_genres = self._suggest_genres(sample_type, bpm, key)
return AudioFeatures(
bpm=bpm,
key=key,
key_confidence=key_confidence,
duration=duration,
sample_rate=sr,
sample_type=sample_type,
spectral_centroid=float(np.mean(spectral_centroids)),
spectral_rolloff=float(np.mean(spectral_rolloffs)),
zero_crossing_rate=float(np.mean(zcr)),
rms_energy=float(np.mean(rms)),
is_harmonic=is_harmonic,
is_percussive=is_percussive,
suggested_genres=suggested_genres
)
def _detect_key_librosa(self, y: np.ndarray, sr: int) -> Tuple[Optional[str], float]:
"""
Detecta la tonalidad usando cromagrama y correlación con perfiles.
"""
try:
# Calcular cromagrama
chroma = self.librosa.feature.chroma_stft(y=y, sr=sr)
chroma_avg = np.mean(chroma, axis=1)
# Normalizar
chroma_avg = chroma_avg / (np.sum(chroma_avg) + 1e-10)
best_key = None
best_score = -np.inf
best_mode = None
# Probar todas las tonalidades mayores y menores
for mode, profile in KEY_PROFILES.items():
for i in range(12):
# Rotar el perfil
rotated_profile = np.roll(profile, i)
# Correlación
score = np.corrcoef(chroma_avg, rotated_profile)[0, 1]
if score > best_score:
best_score = score
best_mode = mode
best_key = NOTE_NAMES[i]
# Formatear resultado
if best_key:
if best_mode == 'minor':
best_key = best_key + 'm'
confidence = max(0.0, min(1.0, (best_score + 1) / 2))
return best_key, confidence
except Exception as e:
logger.warning(f"Error detectando key: {e}")
return None, 0.0
def _is_percussive(self, y: np.ndarray, sr: int) -> bool:
"""
Determina si un sonido es principalmente percusivo.
"""
try:
# Separar componentes armónicos y percusivos
y_harmonic, y_percussive = self.librosa.effects.hpss(y)
# Calcular energía relativa
energy_harmonic = np.sum(y_harmonic ** 2)
energy_percussive = np.sum(y_percussive ** 2)
total_energy = energy_harmonic + energy_percussive
if total_energy > 0:
percussive_ratio = energy_percussive / total_energy
return percussive_ratio > 0.6
except Exception as e:
logger.warning(f"Error en separación HPSS: {e}")
# Fallback: usar duración como heurística
duration = len(y) / sr
return duration < 0.5
def _analyze_basic(self, file_path: str) -> AudioFeatures:
"""
Análisis básico sin dependencias externas.
Usa metadatos del archivo y nombre para inferir características.
"""
path = Path(file_path)
name = path.stem
# Extraer del nombre
bpm = self._extract_bpm_from_name(name)
key = self._extract_key_from_name(name)
# Estimar duración del archivo
duration = self._estimate_duration(file_path)
# Clasificar por nombre
sample_type = self._classify_by_name(name)
# Determinar características por tipo
is_percussive = sample_type in [
SampleType.KICK, SampleType.SNARE, SampleType.CLAP,
SampleType.HAT, SampleType.HAT_CLOSED, SampleType.HAT_OPEN,
SampleType.PERC, SampleType.SHAKER, SampleType.TOM,
SampleType.CRASH, SampleType.RIDE
]
is_harmonic = sample_type in [
SampleType.BASS, SampleType.SYNTH, SampleType.PAD,
SampleType.LEAD, SampleType.PLUCK, SampleType.CHORD,
SampleType.VOCAL
]
# Valores por defecto basados en tipo
spectral_centroid = 5000.0 if is_percussive else 1000.0
rms_energy = 0.5
suggested_genres = self._suggest_genres(sample_type, bpm, key)
return AudioFeatures(
bpm=bpm,
key=key,
key_confidence=0.7 if key else 0.0,
duration=duration,
sample_rate=44100,
sample_type=sample_type,
spectral_centroid=spectral_centroid,
spectral_rolloff=spectral_centroid * 2,
zero_crossing_rate=0.1 if is_harmonic else 0.3,
rms_energy=rms_energy,
is_harmonic=is_harmonic,
is_percussive=is_percussive,
suggested_genres=suggested_genres
)
def _estimate_duration(self, file_path: str) -> float:
"""Estima la duración del archivo de audio"""
try:
import wave
ext = Path(file_path).suffix.lower()
if ext == '.wav':
with wave.open(file_path, 'rb') as wav:
frames = wav.getnframes()
rate = wav.getframerate()
return frames / float(rate)
elif ext in ('.mp3', '.ogg', '.flac', '.aif', '.aiff', '.m4a'):
windows_duration = self._estimate_duration_with_windows_shell(file_path)
if windows_duration > 0:
return windows_duration
# Estimación por tamaño de archivo
size = os.path.getsize(file_path)
# Aproximación: ~176KB por segundo para CD quality stereo
return size / (176.4 * 1024)
except Exception as e:
logger.warning(f"Error estimando duración: {e}")
return 0.0
def _estimate_duration_with_windows_shell(self, file_path: str) -> float:
"""Obtiene la duración usando metadatos del shell de Windows cuando están disponibles."""
if os.name != 'nt':
return 0.0
safe_path = file_path.replace("'", "''")
powershell_command = (
f"$path = '{safe_path}'; "
"$shell = New-Object -ComObject Shell.Application; "
"$folder = $shell.Namespace((Split-Path $path)); "
"$file = $folder.ParseName((Split-Path $path -Leaf)); "
"$duration = $folder.GetDetailsOf($file, 27); "
"Write-Output $duration"
)
try:
result = subprocess.run(
f'powershell -NoProfile -Command "{powershell_command}"',
capture_output=True,
text=True,
timeout=5,
check=False,
shell=True,
)
value = (result.stdout or "").strip()
if not value:
return 0.0
parts = value.split(':')
if len(parts) == 3:
return (int(parts[0]) * 3600) + (int(parts[1]) * 60) + float(parts[2])
return 0.0
except Exception:
return 0.0
def _extract_bpm_from_name(self, name: str) -> Optional[float]:
"""Extrae BPM del nombre del archivo"""
import re
patterns = [
r'[_\s\-](\d{2,3})\s*BPM',
r'[_\s\-](\d{2,3})[_\s\-]',
r'(\d{2,3})bpm',
r'[_\s\-](\d{2,3})\s*(?:BPM|bpm)?\s*(?:\.wav|\.mp3|\.aif)',
]
for pattern in patterns:
match = re.search(pattern, name, re.IGNORECASE)
if match:
bpm = int(match.group(1))
if 60 <= bpm <= 200:
return float(bpm)
return None
def _extract_key_from_name(self, name: str) -> Optional[str]:
"""Extrae key del nombre del archivo"""
import re
patterns = [
r'[_\s\-]([A-G][#b]?(?:m|min|minor)?)[_\s\-]',
r'\bin\s+([A-G][#b]?(?:m|min|minor)?)\b',
r'Key\s+([A-G][#b]?(?:m|min|minor)?)',
r'[_\s\-]([A-G][#b]?)\s*(?:maj|major)?[_\s\-]',
]
for pattern in patterns:
match = re.search(pattern, name, re.IGNORECASE)
if match:
key = match.group(1)
# Normalizar
key = key.replace('b', '#').replace('Db', 'C#').replace('Eb', 'D#')
key = key.replace('Gb', 'F#').replace('Ab', 'G#').replace('Bb', 'A#')
# Detectar si es menor
is_minor = 'm' in key.lower() or 'min' in key.lower()
key = key.replace('min', '').replace('minor', '').replace('major', '')
key = key.rstrip('mM')
if is_minor:
key = key + 'm'
return key
return None
def _classify_sample_type(self, file_path: str, is_percussive: bool,
is_harmonic: bool, duration: float,
spectral_centroid: float, rms: float) -> SampleType:
"""Clasifica el tipo de sample basado en características"""
# Primero intentar por nombre
sample_type = self._classify_by_name(Path(file_path).stem)
if sample_type != SampleType.UNKNOWN:
return sample_type
# Clasificación por características de audio
if is_percussive:
if duration < 0.1:
if spectral_centroid < 2000:
return SampleType.KICK
elif spectral_centroid > 8000:
return SampleType.HAT_CLOSED
else:
return SampleType.SNARE
elif duration < 0.3:
return SampleType.CLAP
else:
return SampleType.PERC
elif is_harmonic:
if spectral_centroid < 500:
return SampleType.BASS
elif duration > 4.0:
return SampleType.PAD
else:
return SampleType.SYNTH
return SampleType.UNKNOWN
def _classify_by_name(self, name: str) -> SampleType:
"""Clasifica el tipo de sample basado en su nombre"""
name_lower = name.lower()
# Mapeo de palabras clave a tipos
keywords = {
SampleType.KICK: ['kick', 'bd', 'bass drum', 'kickdrum', 'kik'],
SampleType.SNARE: ['snare', 'snr', 'sd', 'rim'],
SampleType.CLAP: ['clap', 'clp', 'handclap'],
SampleType.HAT_CLOSED: ['closed hat', 'closedhat', 'chh', 'closed'],
SampleType.HAT_OPEN: ['open hat', 'openhat', 'ohh', 'open'],
SampleType.HAT: ['hat', 'hihat', 'hi-hat', 'hh'],
SampleType.PERC: ['perc', 'percussion', 'conga', 'bongo', 'timb'],
SampleType.SHAKER: ['shaker', 'shake', 'tamb'],
SampleType.TOM: ['tom', 'tomtom'],
SampleType.CRASH: ['crash', 'cymbal'],
SampleType.RIDE: ['ride'],
SampleType.BASS: ['bass', 'bassline', 'sub', '808', 'reese'],
SampleType.SYNTH: ['synth', 'lead', 'arp', 'sequence'],
SampleType.PAD: ['pad', 'atmosphere', 'dron'],
SampleType.PLUCK: ['pluck'],
SampleType.CHORD: ['chord', 'stab'],
SampleType.VOCAL: ['vocal', 'vox', 'voice', 'speech', 'talk'],
SampleType.FX: ['fx', 'effect', 'sweep', 'riser', 'downlifter', 'impact', 'hit', 'noise'],
SampleType.LOOP: ['loop', 'full', 'groove'],
}
for sample_type, words in keywords.items():
for word in words:
if word in name_lower:
return sample_type
return SampleType.UNKNOWN
def _suggest_genres(self, sample_type: SampleType, bpm: Optional[float],
key: Optional[str]) -> List[str]:
"""Sugiere géneros musicales apropiados para el sample"""
genres = []
if bpm:
if 118 <= bpm <= 128:
genres.extend(['house', 'tech-house', 'deep-house'])
elif 124 <= bpm <= 132:
genres.extend(['tech-house', 'techno'])
elif 132 <= bpm <= 142:
genres.extend(['techno', 'peak-time-techno'])
elif 142 <= bpm <= 150:
genres.extend(['trance', 'hard-techno'])
elif 160 <= bpm <= 180:
genres.extend(['drum-and-bass', 'neurofunk'])
elif bpm < 118:
genres.extend(['downtempo', 'ambient', 'lo-fi'])
# Por tipo de sample
if sample_type in [SampleType.KICK, SampleType.SNARE, SampleType.CLAP]:
if not genres:
genres = ['techno', 'house']
elif sample_type == SampleType.BASS:
if not genres:
genres = ['techno', 'house', 'bass-music']
elif sample_type in [SampleType.SYNTH, SampleType.PAD]:
if not genres:
genres = ['trance', 'progressive', 'ambient']
return genres if genres else ['electronic']
def get_compatible_key(self, key: str, shift: int = 0) -> str:
"""
Obtiene una key compatible usando el círculo de quintas.
Args:
key: Key original (ej: 'Am', 'F#m')
shift: Desplazamiento en el círculo (+1 = quinta arriba, -1 = quinta abajo)
Returns:
Key resultante
"""
is_minor = key.endswith('m')
root = key.rstrip('m')
if root not in NOTE_NAMES:
return key
circle = CIRCLE_OF_FIFTHS_MINOR if is_minor else CIRCLE_OF_FIFTHS_MAJOR
try:
idx = circle.index(key)
new_idx = (idx + shift) % 12
return circle[new_idx]
except ValueError:
return key
def calculate_key_compatibility(self, key1: str, key2: str) -> float:
"""
Calcula la compatibilidad entre dos keys (0-1).
Usa el círculo de quintas: keys cercanas son más compatibles.
"""
if key1 == key2:
return 1.0
# Normalizar
def normalize(k):
is_minor = k.endswith('m')
root = k.rstrip('m')
# Convertir bemoles a sostenidos
root = root.replace('Db', 'C#').replace('Eb', 'D#')
root = root.replace('Gb', 'F#').replace('Ab', 'G#').replace('Bb', 'A#')
return root + ('m' if is_minor else '')
k1 = normalize(key1)
k2 = normalize(key2)
if k1 == k2:
return 1.0
# Verificar si son modos diferentes de la misma nota
if k1.rstrip('m') == k2.rstrip('m'):
return 0.8 # Mismo root, diferente modo
# Usar círculo de quintas
is_minor1 = k1.endswith('m')
is_minor2 = k2.endswith('m')
if is_minor1 != is_minor2:
return 0.3 # Diferente modo, baja compatibilidad
circle = CIRCLE_OF_FIFTHS_MINOR if is_minor1 else CIRCLE_OF_FIFTHS_MAJOR
try:
idx1 = circle.index(k1)
idx2 = circle.index(k2)
distance = min(abs(idx1 - idx2), 12 - abs(idx1 - idx2))
# Compatibilidad decrece con la distancia
compatibility = max(0.0, 1.0 - (distance * 0.2))
return compatibility
except ValueError:
return 0.0
# Lazily-created global instance, shared via get_analyzer()
_analyzer: Optional[AudioAnalyzer] = None
def get_analyzer() -> AudioAnalyzer:
    """Return the process-wide AudioAnalyzer, creating it on first use."""
    global _analyzer
    instance = _analyzer
    if instance is None:
        instance = AudioAnalyzer()
        _analyzer = instance
    return instance
def analyze_sample(file_path: str) -> Dict[str, Any]:
    """
    Convenience wrapper: analyze a sample with the shared analyzer.

    Returns:
        Dictionary with the sample's extracted characteristics.
    """
    features = get_analyzer().analyze(file_path)
    summary: Dict[str, Any] = {
        field: getattr(features, field)
        for field in ('bpm', 'key', 'key_confidence', 'duration', 'sample_rate')
    }
    summary['sample_type'] = features.sample_type.value
    for field in ('spectral_centroid', 'rms_energy', 'is_harmonic',
                  'is_percussive', 'suggested_genres'):
        summary[field] = getattr(features, field)
    return summary
def quick_analyze(file_path: str) -> Dict[str, Any]:
    """
    Fast analysis driven purely by the file name.

    Uses the 'basic' backend, so no external dependencies are required.
    """
    features = AudioAnalyzer(backend="basic").analyze(file_path)
    return {
        'bpm': features.bpm,
        'key': features.key,
        'sample_type': features.sample_type.value,
        'suggested_genres': features.suggested_genres,
    }
# Manual testing entry point: analyze a file given on the command line
if __name__ == "__main__":
    import sys
    logging.basicConfig(level=logging.INFO)
    if len(sys.argv) < 2:
        print("Uso: python audio_analyzer.py <archivo_de_audio>")
        sys.exit(1)
    file_path = sys.argv[1]
    print(f"\nAnalizando: {file_path}")
    print("=" * 50)
    try:
        result = analyze_sample(file_path)
        print("\nResultados:")
        print(f" BPM: {result['bpm'] or 'No detectado'}")
        print(f" Key: {result['key'] or 'No detectado'} (confianza: {result['key_confidence']:.2f})")
        print(f" Duración: {result['duration']:.2f}s")
        print(f" Tipo: {result['sample_type']}")
        print(f" Géneros sugeridos: {', '.join(result['suggested_genres'])}")
        print(f" Es percusivo: {result['is_percussive']}")
        print(f" Es armónico: {result['is_harmonic']}")
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)