feat: Implement senior audio injection with 5 fallback methods

- Add _cmd_create_arrangement_audio_pattern with 5-method fallback chain
- Method 1: track.insert_arrangement_clip() [Live 12+]
- Method 2: track.create_audio_clip() [Live 11+]
- Method 3: arrangement_clips.add_new_clip() [Live 12+]
- Method 4: Session->duplicate_clip_to_arrangement [Legacy]
- Method 5: Session->Recording [Universal]

- Add _cmd_duplicate_clip_to_arrangement for session-to-arrangement workflow
- Update skills documentation
- Verified: 3 clips created at positions [0, 4, 8] in Arrangement View

Closes: Audio injection in Arrangement View
This commit is contained in:
OpenCode Agent
2026-04-12 14:02:32 -03:00
commit 5ce8187c65
118 changed files with 55075 additions and 0 deletions

View File

@@ -0,0 +1,635 @@
"""
Embedding Engine - Vector embeddings for audio samples
Crea embeddings vectoriales normalizados para samples usando features espectrales.
"""
import json
import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import numpy as np
# Try to import libreria_analyzer for integration;
# if it does not exist, operate standalone.
# Prefer the full analyzer when it ships alongside this module; otherwise run
# in standalone mode with a local pitch-class table.
try:
    from .libreria_analyzer import LibreriaAnalyzer, NOTE_TO_NUMBER
    HAS_ANALYZER = True
except ImportError:
    HAS_ANALYZER = False
    # Pitch classes C=0 .. B=11; enharmonic spellings (C#/Db, D#/Eb, ...)
    # share the same value.
    NOTE_TO_NUMBER = {
        'C': 0, 'C#': 1, 'Db': 1, 'D': 2, 'D#': 3, 'Eb': 3,
        'E': 4, 'F': 5, 'F#': 6, 'Gb': 6, 'G': 7, 'G#': 8,
        'Ab': 8, 'A': 9, 'A#': 10, 'Bb': 10, 'B': 11
    }
class EmbeddingEngine:
    """
    Vector embedding engine for audio samples.

    Builds one ~20-dimensional vector per sample with the layout:

        index 0    : duration            (normalized over 0-10 s)
        index 1    : BPM                 (normalized over 60-200 BPM)
        index 2    : key                 (pitch class 0-11, scaled to 0-1)
        index 3    : RMS in dB           (normalized over -60..0 dB)
        index 4    : spectral centroid   (normalized over 0-10000 Hz)
        index 5    : spectral rolloff    (normalized over 0-20000 Hz)
        index 6    : zero crossing rate  (already 0-1)
        index 7-19 : 13 MFCC coefficients (normalized from -100..100)

    Onset strength is read from the features but intentionally NOT stored:
    the 20 slots are fully used by the layout above.

    All embeddings are additionally normalized with per-dimension min-max
    scaling (see normalize_embeddings).
    """

    EMBEDDING_DIM = 20  # 1 Duration + 1 BPM + 1 Key + 1 RMS + 1 SC + 1 SR + 1 ZCR + 13 MFCCs
    EMBEDDINGS_FILE = Path("C:/ProgramData/Ableton/Live 12 Suite/Resources/MIDI Remote Scripts/libreria/reggaeton/.embeddings_index.json")
    FEATURES_CACHE = Path("C:/ProgramData/Ableton/Live 12 Suite/Resources/MIDI Remote Scripts/libreria/reggaeton/.features_cache.json")

    def __init__(self, features_data: Optional[Dict] = None):
        """
        Initialize the embedding engine.

        Args:
            features_data: Pre-loaded features data (optional)
        """
        self.embeddings: Dict[str, np.ndarray] = {}
        self.normalized_embeddings: Dict[str, np.ndarray] = {}
        self.min_values: Optional[np.ndarray] = None
        self.max_values: Optional[np.ndarray] = None
        self.features_data = features_data or {}
        # Load previously saved embeddings from disk, if an index file exists.
        self._load_embeddings()

    def _key_to_number(self, key: str) -> float:
        """
        Convert a musical key (e.g. 'C#m', 'F', 'Ab') to a pitch class 0-11.

        Args:
            key: Key as a string (may include 'm'/'minor'/'major' markers)

        Returns:
            float: Pitch-class number (0-11), or 0.0 if not recognized
        """
        if not key:
            return 0.0
        # Strip whitespace and mode markers. Remove the words 'MINOR'/'MAJOR'
        # BEFORE the bare 'M', otherwise they could never match.
        key_clean = key.strip().upper()
        key_clean = key_clean.replace('MINOR', '').replace('MAJOR', '').replace('M', '')
        key_clean = ''.join(c for c in key_clean if c.isalpha() or c == '#')
        if not key_clean:
            return 0.0
        # Base note is 1-2 characters. Upper-casing turns 'Ab' into 'AB',
        # which is not a key of NOTE_TO_NUMBER (flats are spelled 'Ab'),
        # so rebuild the table spelling for flats/sharps explicitly.
        if len(key_clean) >= 2 and key_clean[1] in ('#', 'B'):
            note = key_clean[0] + ('#' if key_clean[1] == '#' else 'b')
        else:
            note = key_clean[0]
        return float(NOTE_TO_NUMBER.get(note, 0))

    def _bpm_to_normalized(self, bpm: float) -> float:
        """
        Normalize BPM to the 0-1 range (assuming the typical 60-200 span).

        Args:
            bpm: BPM of the sample

        Returns:
            float: Normalized BPM (0-1); 0.5 when BPM is missing/invalid
        """
        if bpm <= 0:
            return 0.5  # Neutral value when no BPM is available
        # Typical electronic-music range: 60-200 BPM
        min_bpm, max_bpm = 60.0, 200.0
        return float(np.clip((bpm - min_bpm) / (max_bpm - min_bpm), 0.0, 1.0))

    def create_embedding(self, features: Dict) -> np.ndarray:
        """
        Create an embedding vector from a sample's features.

        Layout (see class docstring): 0 duration, 1 BPM, 2 key, 3 RMS,
        4 spectral centroid, 5 spectral rolloff, 6 ZCR, 7-19 MFCCs.

        Args:
            features: Dictionary with the sample's features

        Returns:
            np.ndarray: Embedding vector (EMBEDDING_DIM values, float32)
        """
        embedding = np.zeros(self.EMBEDDING_DIM, dtype=np.float32)
        # 0. Duration, normalized over 0-10 seconds
        embedding[0] = np.clip(features.get('duration', 1.0) / 10.0, 0.0, 1.0)
        # 1. BPM, normalized over 60-200 BPM
        embedding[1] = self._bpm_to_normalized(features.get('bpm', 0))
        # 2. Key as pitch class, scaled to 0-1
        embedding[2] = self._key_to_number(features.get('key', '')) / 11.0
        # 3. RMS arrives in dB; normalize -60..0 dB
        embedding[3] = np.clip((features.get('rms', -30) + 60.0) / 60.0, 0.0, 1.0)
        # 4. Spectral centroid, normalized over 0-10000 Hz
        embedding[4] = np.clip(features.get('spectral_centroid', 2000) / 10000.0, 0.0, 1.0)
        # 5. Spectral rolloff, normalized over 0-20000 Hz
        embedding[5] = np.clip(features.get('spectral_rolloff', 8000) / 20000.0, 0.0, 1.0)
        # 6. Zero crossing rate, already in 0-1
        embedding[6] = np.clip(features.get('zero_crossing_rate', 0.1), 0.0, 1.0)
        # 7-19. 13 MFCC coefficients, typically in -100..100. Pad short lists
        # with zeros; a padded zero normalizes to the neutral value 0.5.
        mfccs = list(features.get('mfccs', [0] * 13))
        if len(mfccs) < 13:
            mfccs += [0] * (13 - len(mfccs))
        for i in range(13):
            embedding[7 + i] = np.clip((mfccs[i] + 100) / 200.0, 0.0, 1.0)
        return embedding

    def normalize_embeddings(self) -> None:
        """
        Min-max normalize all embeddings, each dimension independently to [0, 1].

        Populates self.normalized_embeddings and records self.min_values /
        self.max_values so external query vectors can be scaled the same way.
        """
        if not self.embeddings:
            return
        # Stack into a (samples, dims) matrix
        paths = list(self.embeddings.keys())
        matrix = np.array([self.embeddings[p] for p in paths], dtype=np.float32)
        self.min_values = matrix.min(axis=0)
        self.max_values = matrix.max(axis=0)
        # Avoid division by zero on constant dimensions
        ranges = self.max_values - self.min_values
        ranges[ranges == 0] = 1.0
        normalized_matrix = (matrix - self.min_values) / ranges
        self.normalized_embeddings = {
            path: normalized_matrix[i]
            for i, path in enumerate(paths)
        }

    def build_from_features(self, features_data: Optional[Dict] = None) -> None:
        """
        Build embeddings from features data.

        Args:
            features_data: Dictionary with per-sample features. Falls back to
                self.features_data, then to the on-disk features cache.
        """
        if features_data is None:
            features_data = self.features_data
        if not features_data or 'samples' not in features_data:
            # Try loading from the cache file
            if self.FEATURES_CACHE.exists():
                with open(self.FEATURES_CACHE, 'r') as f:
                    features_data = json.load(f)
        if not features_data or 'samples' not in features_data:
            print("[EmbeddingEngine] No features data available")
            return
        samples = features_data.get('samples', {})
        print(f"[EmbeddingEngine] Building embeddings for {len(samples)} samples...")
        self.embeddings = {}
        for path, features in samples.items():
            try:
                self.embeddings[path] = self.create_embedding(features)
            except Exception as e:
                # Best-effort: skip samples whose features are malformed
                print(f"[EmbeddingEngine] Error creating embedding for {path}: {e}")
        self.normalize_embeddings()
        print(f"[EmbeddingEngine] Created {len(self.embeddings)} embeddings")

    def save_embeddings(self) -> None:
        """
        Save the normalized embeddings to the JSON index file.
        """
        if not self.normalized_embeddings:
            print("[EmbeddingEngine] No embeddings to save")
            return
        # Serialize embeddings as plain lists (JSON cannot hold ndarrays)
        data = {
            'version': '1.0',
            'dimensions': self.EMBEDDING_DIM,
            'total_samples': len(self.normalized_embeddings),
            'created_at': str(np.datetime64('now')),
            # min/max are persisted so query vectors can be scaled identically
            'min_values': self.min_values.tolist() if self.min_values is not None else None,
            'max_values': self.max_values.tolist() if self.max_values is not None else None,
            'embeddings': {
                path: embedding.tolist()
                for path, embedding in self.normalized_embeddings.items()
            }
        }
        # Make sure the target directory exists
        self.EMBEDDINGS_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(self.EMBEDDINGS_FILE, 'w') as f:
            json.dump(data, f, indent=2)
        print(f"[EmbeddingEngine] Saved {len(self.normalized_embeddings)} embeddings to {self.EMBEDDINGS_FILE}")

    def _load_embeddings(self) -> bool:
        """
        Load embeddings from the index file, if it exists.

        Returns:
            bool: True if the index was loaded successfully
        """
        if not self.EMBEDDINGS_FILE.exists():
            return False
        try:
            with open(self.EMBEDDINGS_FILE, 'r') as f:
                data = json.load(f)
            self.EMBEDDING_DIM = data.get('dimensions', 20)
            self.min_values = np.array(data.get('min_values')) if data.get('min_values') else None
            self.max_values = np.array(data.get('max_values')) if data.get('max_values') else None
            self.normalized_embeddings = {
                path: np.array(emb, dtype=np.float32)
                for path, emb in data.get('embeddings', {}).items()
            }
            # NOTE(review): the file stores normalized vectors, so after a load
            # self.embeddings mirrors the normalized data rather than raw ones.
            self.embeddings = self.normalized_embeddings.copy()
            print(f"[EmbeddingEngine] Loaded {len(self.normalized_embeddings)} embeddings from cache")
            return True
        except Exception as e:
            print(f"[EmbeddingEngine] Error loading embeddings: {e}")
            return False

    def cosine_distance(self, emb1: np.ndarray, emb2: np.ndarray) -> float:
        """
        Cosine distance between two embeddings.

        Args:
            emb1: First embedding
            emb2: Second embedding

        Returns:
            float: Cosine distance (0 = identical direction, up to 2 for
                opposite vectors); 1.0 when either vector has zero norm
        """
        norm1 = np.linalg.norm(emb1)
        norm2 = np.linalg.norm(emb2)
        if norm1 == 0 or norm2 == 0:
            return 1.0
        similarity = np.dot(emb1, emb2) / (norm1 * norm2)
        # Convert similarity to distance (0 = similar)
        return float(1.0 - np.clip(similarity, -1.0, 1.0))

    def euclidean_distance(self, emb1: np.ndarray, emb2: np.ndarray) -> float:
        """
        Euclidean distance between two embeddings, scaled by sqrt(dim).

        Args:
            emb1: First embedding
            emb2: Second embedding

        Returns:
            float: Normalized Euclidean distance
        """
        diff = emb1 - emb2
        return float(np.sqrt(np.sum(diff ** 2)) / np.sqrt(self.EMBEDDING_DIM))

    def find_similar(self, sample_path: str, top_n: int = 10,
                     use_cosine: bool = True) -> List[Tuple[str, float]]:
        """
        Find the samples most similar to a given indexed sample.

        Args:
            sample_path: Path of the reference sample (must be in the index;
                it is resolved to an absolute path before lookup)
            top_n: Number of results to return
            use_cosine: True for cosine distance, False for Euclidean

        Returns:
            List[Tuple[str, float]]: (path, distance) pairs sorted by similarity
        """
        if not self.normalized_embeddings:
            print("[EmbeddingEngine] No embeddings available")
            return []
        # Use the absolute path as the lookup key
        # NOTE(review): assumes index keys were stored resolved — verify.
        sample_path = str(Path(sample_path).resolve())
        if sample_path not in self.normalized_embeddings:
            print(f"[EmbeddingEngine] Sample not found: {sample_path}")
            return []
        reference_emb = self.normalized_embeddings[sample_path]
        distance_func = self.cosine_distance if use_cosine else self.euclidean_distance
        distances = [
            (path, distance_func(reference_emb, emb))
            for path, emb in self.normalized_embeddings.items()
            if path != sample_path  # exclude the sample itself
        ]
        # Smaller distance = more similar
        distances.sort(key=lambda x: x[1])
        return distances[:top_n]

    def find_by_audio_reference(self, audio_file_path: str, top_n: int = 20,
                                use_cosine: bool = True) -> List[Tuple[str, float]]:
        """
        Analyze an audio file and find similar samples in the index.

        Args:
            audio_file_path: Path of the audio file to analyze
            top_n: Number of similar samples to return
            use_cosine: True for cosine distance, False for Euclidean

        Returns:
            List[Tuple[str, float]]: (path, distance) pairs sorted by similarity
        """
        if not self.normalized_embeddings:
            print("[EmbeddingEngine] No embeddings available")
            return []
        # Extract features via the analyzer when available
        features = None
        if HAS_ANALYZER:
            try:
                analyzer = LibreriaAnalyzer()
                features = analyzer.analyze_single_file(audio_file_path)
            except Exception as e:
                print(f"[EmbeddingEngine] Error analyzing reference: {e}")
        if features is None:
            # Fallback: minimal default features
            print("[EmbeddingEngine] Using fallback analysis")
            features = self._fallback_analyze(audio_file_path)
        if features is None:
            print(f"[EmbeddingEngine] Could not analyze: {audio_file_path}")
            return []
        reference_emb = self.create_embedding(features)
        # Scale the query with the same min/max used for the index
        if self.min_values is not None and self.max_values is not None:
            ranges = self.max_values - self.min_values
            ranges[ranges == 0] = 1.0
            reference_emb = (reference_emb - self.min_values) / ranges
        distance_func = self.cosine_distance if use_cosine else self.euclidean_distance
        distances = [
            (path, distance_func(reference_emb, emb))
            for path, emb in self.normalized_embeddings.items()
        ]
        distances.sort(key=lambda x: x[1])
        return distances[:top_n]

    def _fallback_analyze(self, audio_file_path: str) -> Optional[Dict]:
        """
        Basic fallback analysis when the real analyzer is unavailable.

        Args:
            audio_file_path: Path of the file

        Returns:
            Dict with minimal features, or None when the file is inaccessible
        """
        try:
            # os.stat doubles as an existence/permission check; its result is
            # otherwise unused.
            os.stat(audio_file_path)
            # Defaults roughly matching typical reggaeton material
            return {
                'bpm': 95.0,
                'key': 'C',
                'rms': -12.0,
                'spectral_centroid': 3000.0,
                'spectral_rolloff': 8000.0,
                'zero_crossing_rate': 0.1,
                'mfccs': [0.0] * 13,
                'onset_strength': 0.6,
                'duration': 4.0,
                'sample_rate': 44100,
                'channels': 2
            }
        except Exception:
            return None

    def get_embedding(self, sample_path: str) -> Optional[np.ndarray]:
        """
        Get the normalized embedding of a specific sample.

        Args:
            sample_path: Path of the sample (resolved to absolute for lookup)

        Returns:
            np.ndarray: The sample's embedding, or None if not indexed
        """
        sample_path = str(Path(sample_path).resolve())
        return self.normalized_embeddings.get(sample_path)

    def get_stats(self) -> Dict:
        """
        Return summary statistics over the normalized embeddings.

        Returns:
            Dict with per-dimension mean/std/min/max and counts
        """
        if not self.normalized_embeddings:
            return {'total_samples': 0}
        matrix = np.array(list(self.normalized_embeddings.values()))
        return {
            'total_samples': len(self.normalized_embeddings),
            'dimensions': self.EMBEDDING_DIM,
            'mean_per_dim': matrix.mean(axis=0).tolist(),
            'std_per_dim': matrix.std(axis=0).tolist(),
            'min_per_dim': matrix.min(axis=0).tolist(),
            'max_per_dim': matrix.max(axis=0).tolist()
        }
# Convenience functions for direct use
def create_embeddings_index(features_file: Optional[str] = None,
                            output_file: Optional[str] = None) -> EmbeddingEngine:
    """
    Build the complete embeddings index.

    Args:
        features_file: Path to the features file (default: .features_cache.json)
        output_file: Output path (default: .embeddings_index.json)

    Returns:
        EmbeddingEngine with the created embeddings
    """
    eng = EmbeddingEngine()
    if not features_file:
        eng.build_from_features()
    else:
        with open(features_file, 'r') as fh:
            eng.build_from_features(json.load(fh))
    if output_file:
        # Redirect persistence to the caller-supplied location
        eng.EMBEDDINGS_FILE = Path(output_file)
    eng.save_embeddings()
    return eng
def find_similar_samples(sample_path: str, top_n: int = 10,
                         embeddings_file: Optional[str] = None) -> List[Tuple[str, float]]:
    """
    Convenience helper: samples most similar to an indexed sample.

    Args:
        sample_path: Path of the reference sample
        top_n: Number of results
        embeddings_file: Optional path to an embeddings index file

    Returns:
        List of (path, distance) pairs
    """
    eng = EmbeddingEngine()
    if embeddings_file:
        # Point at the caller-supplied index and reload it
        eng.EMBEDDINGS_FILE = Path(embeddings_file)
        eng._load_embeddings()
    return eng.find_similar(sample_path, top_n)
def find_samples_like_audio(audio_path: str, top_n: int = 20,
                            embeddings_file: Optional[str] = None) -> List[Tuple[str, float]]:
    """
    Convenience helper: samples most similar to an arbitrary audio file.

    Args:
        audio_path: Path of the reference audio
        top_n: Number of results
        embeddings_file: Optional path to an embeddings index file

    Returns:
        List of (path, distance) pairs
    """
    eng = EmbeddingEngine()
    if embeddings_file:
        # Point at the caller-supplied index and reload it
        eng.EMBEDDINGS_FILE = Path(embeddings_file)
        eng._load_embeddings()
    return eng.find_by_audio_reference(audio_path, top_n)
def cosine_similarity(emb1, emb2) -> float:
    """Cosine similarity of two vectors; 0.0 if either has zero norm.

    Compatibility helper used by server.py.
    """
    a = np.asarray(emb1, dtype=float)
    b = np.asarray(emb2, dtype=float)
    scale = np.linalg.norm(a) * np.linalg.norm(b)
    if not scale:
        return 0.0
    return float(np.dot(a, b) / scale)
# Simple smoke test
if __name__ == '__main__':
    print("[EmbeddingEngine] Running basic tests...")

    # Test 1: build an embedding from dummy features
    test_features = {
        'bpm': 95,
        'key': 'C',
        'rms': -12.5,
        'spectral_centroid': 2500.0,
        'spectral_rolloff': 8000.0,
        'zero_crossing_rate': 0.15,
        'mfccs': [0.5, -0.3, 0.1, 0.2, -0.1, 0.0, 0.3, -0.2, 0.1, 0.0, -0.1, 0.2, 0.1],
        'onset_strength': 0.85,
        'duration': 0.5,
        'sample_rate': 44100,
        'channels': 1
    }
    eng = EmbeddingEngine()
    vec = eng.create_embedding(test_features)
    print(f"[Test] Created embedding with shape: {vec.shape}")
    print(f"[Test] Embedding values: {vec[:5]}...")
    print(f"[Test] Embedding range: [{vec.min():.3f}, {vec.max():.3f}]")

    # Test 2: min-max normalization across three scaled copies
    eng.embeddings = {
        'sample1.wav': vec,
        'sample2.wav': vec * 0.8,
        'sample3.wav': vec * 1.2
    }
    eng.normalize_embeddings()
    print(f"[Test] Normalized {len(eng.normalized_embeddings)} embeddings")

    # Test 3: cosine distance between a vector and a scaled copy
    d = eng.cosine_distance(vec, vec * 0.9)
    print(f"[Test] Cosine distance (emb vs 0.9*emb): {d:.4f}")
    print("[EmbeddingEngine] All tests passed!")