"""
IntelligentSampleSelector - Coherent Sample Selection Engine

Uses embeddings from .embeddings_index.json to select samples that work
together musically based on cosine similarity.

Architecture:
- Embeddings-based similarity using cosine distance
- Energy matching for intensity coherence
- Coherence threshold: 0.90 (configurable)
- Never falls back to random selection
"""

import json
import os
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple, NamedTuple
from dataclasses import dataclass

import numpy as np

logger = logging.getLogger(__name__)


class CoherenceError(Exception):
    """Raised when no samples meet the coherence threshold.

    The ``details`` attribute carries a dictionary with diagnostic context
    (empty when none was supplied).
    """

    def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
        super().__init__(message)
        self.details = details or {}


@dataclass
class SelectionRationale:
    """Tracks why a sample was selected."""

    sample_path: str
    similarity_to_anchor: float
    energy_match: bool
    energy_delta: float
    selection_reason: str


@dataclass
class SelectedSample:
    """A selected sample with metadata."""

    path: str
    role: str
    energy: float
    coherence_score: float
    rationale: SelectionRationale


class IntelligentSampleSelector:
    """
    Selects coherent sample sets using embedding-based similarity.

    Uses embeddings from .embeddings_index.json and calculates cosine
    similarity to find samples that work together musically.

    Coherence threshold: 0.90 by default (minimum cosine similarity)
    Energy matching: ±10% of target energy by default

    Never falls back to random selection - raises CoherenceError if no
    samples meet criteria.
    """

    def __init__(
        self,
        embeddings_path: Optional[str] = None,
        coherence_threshold: float = 0.90,
        energy_tolerance: float = 0.10
    ):
        """
        Initialize the selector and load the embeddings index from disk.

        Args:
            embeddings_path: Path to .embeddings_index.json. When None, a
                default location relative to this file is used.
            coherence_threshold: Minimum cosine similarity (default 0.90)
            energy_tolerance: Energy matching tolerance (default 0.10 = ±10%)

        Raises:
            FileNotFoundError: If the embeddings file does not exist.
            ValueError: If the embeddings file is not valid JSON.
            RuntimeError: If the embeddings file cannot be read or parsed
                for any other reason.
        """
        self.coherence_threshold = coherence_threshold
        self.energy_tolerance = energy_tolerance
        self.embeddings: Dict[str, np.ndarray] = {}
        self.metadata: Dict[str, Dict[str, Any]] = {}
        self.rationale_log: List[SelectionRationale] = []

        if embeddings_path is None:
            # Default: three directories above this file, then
            # ../libreria/reggaeton/.embeddings_index.json
            script_dir = Path(__file__).parent.parent.parent
            embeddings_path = str(
                script_dir / ".." / "libreria" / "reggaeton" / ".embeddings_index.json"
            )

        self.embeddings_path = embeddings_path
        self._load_embeddings()

    def _load_embeddings(self) -> None:
        """
        Load embeddings and metadata from the JSON file.

        Two layouts are supported:
        - ``{"embeddings": {path: [vector]}}`` - metadata is derived from the
          vector and from the sample's parent folder name.
        - ``{"samples": {id: {"embedding": [...], "path": ..., ...}}}`` -
          metadata is read from each entry.

        Raises:
            FileNotFoundError: If the embeddings file does not exist.
            ValueError: If the file is not valid JSON.
            RuntimeError: On any other failure while reading or parsing.
        """
        if not os.path.exists(self.embeddings_path):
            raise FileNotFoundError(
                f"Embeddings file not found: {self.embeddings_path}. "
                f"Run sample analysis first to generate embeddings."
            )

        try:
            with open(self.embeddings_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if "embeddings" in data:
                # Format: { "embeddings": { "path": [vector], ... } }
                for sample_path, vector in data["embeddings"].items():
                    if not vector:
                        continue
                    self.embeddings[sample_path] = np.array(vector, dtype=np.float32)
                    # Infer role from the name of the containing folder
                    folder = os.path.basename(os.path.dirname(sample_path))
                    self.metadata[sample_path] = {
                        "path": sample_path,
                        # Index 3 is assumed to hold RMS energy
                        "energy": vector[3] if len(vector) > 3 else 0.0,
                        # Index 1 is assumed to hold BPM normalized by 200
                        "bpm": vector[1] * 200 if len(vector) > 1 else 0.0,
                        "key": "",  # Not stored in this format
                        "role": folder,
                    }
            elif "samples" in data:
                # Format: { "samples": { "id": { "embedding": [...], ... } } }
                for sample_id, info in data["samples"].items():
                    embedding = info.get("embedding")
                    if not embedding:
                        continue
                    self.embeddings[sample_id] = np.array(embedding, dtype=np.float32)
                    self.metadata[sample_id] = {
                        "path": info.get("path", ""),
                        "energy": info.get("energy", 0.0),
                        "bpm": info.get("bpm", 0.0),
                        "key": info.get("key", ""),
                        "role": info.get("role", "unknown"),
                    }
            else:
                # Unknown layout: nothing is loaded, so make that visible
                logger.warning(
                    f"No 'embeddings' or 'samples' section in {self.embeddings_path}"
                )

            logger.info(
                f"Loaded {len(self.embeddings)} embeddings from {self.embeddings_path}"
            )
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid embeddings JSON: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Failed to load embeddings: {e}") from e

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """
        Calculate cosine similarity between two vectors.

        Formula: dot(a, b) / (norm(a) * norm(b))

        Args:
            a: First embedding vector
            b: Second embedding vector

        Returns:
            Cosine similarity in range [-1, 1]; 0.0 if either vector has
            zero norm.
        """
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a, b) / (norm_a * norm_b))

    def _get_sample_energy(self, sample_id: str) -> float:
        """Return the stored energy for a sample (0.0 when unknown)."""
        return self.metadata.get(sample_id, {}).get("energy", 0.0)

    def _sample_path(self, sample_id: str) -> str:
        """Return the stored path for a sample, or its ID when the path is missing/empty."""
        return self.metadata.get(sample_id, {}).get("path") or sample_id

    def _resolve_sample_id(self, path_or_id: str) -> Optional[str]:
        """Map a sample ID or a stored path to a sample ID; None when unknown."""
        if path_or_id in self.metadata:
            return path_or_id
        if not path_or_id:
            # An empty reference must not match samples whose path is empty
            return None
        for sid, meta in self.metadata.items():
            if meta.get("path") == path_or_id:
                return sid
        return None

    def _energy_matches(self, sample_energy: float, target_energy: float) -> Tuple[bool, float]:
        """
        Check if sample energy matches target within tolerance.

        Args:
            sample_energy: Sample's energy
            target_energy: Target energy level

        Returns:
            Tuple of (matches, delta) where delta is the relative difference.
            A target of 0 matches everything with delta 0.0.
        """
        if target_energy == 0:
            return True, 0.0
        # Normalize by the magnitude so a negative target cannot produce a
        # negative delta that trivially passes the tolerance check.
        delta = abs(sample_energy - target_energy) / abs(target_energy)
        return delta <= self.energy_tolerance, delta

    def _get_samples_by_role(self, role: str) -> List[str]:
        """Get all sample IDs whose role equals ``role`` (case-insensitive)."""
        wanted = role.lower()
        return [
            sid for sid, meta in self.metadata.items()
            if (meta.get("role") or "").lower() == wanted
        ]

    def select_anchor_sample(
        self,
        role: str,
        target_energy: float
    ) -> Tuple[str, SelectionRationale]:
        """
        Find the most representative sample for a role and energy level.

        Candidates are the samples of the role whose energy is within
        tolerance of ``target_energy`` and that have an embedding. Each is
        scored as 70% average cosine similarity to the other samples of the
        role plus 30% energy closeness; the highest score wins (first one on
        ties).

        Args:
            role: Sample role (e.g., "kick", "snare", "bass")
            target_energy: Target energy level

        Returns:
            Tuple of (sample_id, rationale)

        Raises:
            CoherenceError: If no samples found for role or no energy matches
        """
        role_samples = self._get_samples_by_role(role)

        if not role_samples:
            available_roles = set(
                m.get("role", "unknown") for m in self.metadata.values()
            )
            raise CoherenceError(
                f"No samples found for role: {role}",
                details={
                    "requested_role": role,
                    "available_roles": list(available_roles),
                    "total_samples": len(self.metadata)
                }
            )

        # (id, score, energy, energy_delta)
        scored_samples: List[Tuple[str, float, float, float]] = []

        for sample_id in role_samples:
            sample_energy = self._get_sample_energy(sample_id)
            energy_matches, energy_delta = self._energy_matches(
                sample_energy, target_energy
            )
            if not energy_matches or sample_id not in self.embeddings:
                continue

            # Average similarity to every other embedded sample of the role
            similarities = [
                self._cosine_similarity(
                    self.embeddings[sample_id], self.embeddings[other_id]
                )
                for other_id in role_samples
                if other_id != sample_id and other_id in self.embeddings
            ]
            avg_similarity = float(np.mean(similarities)) if similarities else 0.0

            # Weight: 70% similarity, 30% energy match
            energy_score = 1.0 - energy_delta
            total_score = (0.7 * avg_similarity) + (0.3 * energy_score)
            scored_samples.append((sample_id, total_score, sample_energy, energy_delta))

        if not scored_samples:
            raise CoherenceError(
                f"No samples match energy target for role '{role}'",
                details={
                    "role": role,
                    "target_energy": target_energy,
                    "tolerance": self.energy_tolerance,
                    "candidates": len(role_samples),
                    "sample_energies": [
                        self._get_sample_energy(sid) for sid in role_samples[:10]
                    ]
                }
            )

        # max() keeps the first of equally scored candidates
        anchor_id, score, anchor_energy, anchor_delta = max(
            scored_samples, key=lambda item: item[1]
        )

        rationale = SelectionRationale(
            sample_path=self._sample_path(anchor_id),
            similarity_to_anchor=1.0,  # Self-similarity
            energy_match=True,
            energy_delta=anchor_delta,
            selection_reason=f"Highest representativeness score ({score:.3f}) for role '{role}' at energy {target_energy:.3f}"
        )

        logger.info(
            f"Selected anchor for {role}: {anchor_id} (score={score:.3f}, energy={anchor_energy:.3f})"
        )

        return anchor_id, rationale

    def find_similar_samples(
        self,
        reference_path: str,
        count: int = 5,
        min_similarity: float = 0.90,
        role_filter: Optional[str] = None
    ) -> List[Tuple[str, float, SelectionRationale]]:
        """
        Find samples similar to a reference sample.

        Energy is not used as a filter here; whether each result's energy is
        within tolerance of the reference's energy is recorded in its
        rationale.

        Args:
            reference_path: Path or ID of reference sample
            count: Maximum number of similar samples to return (values
                below 0 are treated as 0)
            min_similarity: Minimum cosine similarity threshold
            role_filter: Optional role to filter by (case-insensitive)

        Returns:
            List of (sample_id, similarity, rationale) tuples, sorted by
            descending similarity

        Raises:
            CoherenceError: If the reference is unknown or has no embedding,
                or if no samples meet the similarity threshold
        """
        reference_id = self._resolve_sample_id(reference_path)

        if reference_id is None:
            raise CoherenceError(
                f"Reference sample not found: {reference_path}",
                details={
                    "reference": reference_path,
                    "available_samples": len(self.metadata)
                }
            )

        if reference_id not in self.embeddings:
            raise CoherenceError(
                f"Reference sample has no embedding: {reference_path}",
                details={"reference_id": reference_id}
            )

        reference_embedding = self.embeddings[reference_id]
        reference_energy = self._get_sample_energy(reference_id)
        wanted_role = role_filter.lower() if role_filter else None

        # (id, similarity, energy) for every candidate
        similarities: List[Tuple[str, float, float]] = []
        for sample_id, embedding in self.embeddings.items():
            if sample_id == reference_id:
                continue
            if wanted_role is not None:
                sample_role = self.metadata.get(sample_id, {}).get("role") or ""
                if sample_role.lower() != wanted_role:
                    continue
            sim = self._cosine_similarity(reference_embedding, embedding)
            similarities.append((sample_id, sim, self._get_sample_energy(sample_id)))

        above_threshold = [item for item in similarities if item[1] >= min_similarity]

        if not above_threshold:
            # Report the closest candidate to help the caller tune thresholds
            best_match = (
                max(similarities, key=lambda item: item[1])
                if similarities else (None, 0.0, 0.0)
            )

            def _count_at_least(level: float) -> int:
                return sum(1 for item in similarities if item[1] >= level)

            raise CoherenceError(
                f"No samples meet similarity threshold {min_similarity} for {reference_path}",
                details={
                    "reference": reference_path,
                    "min_similarity": min_similarity,
                    "best_match_similarity": best_match[1] if best_match[0] is not None else 0.0,
                    "best_match_id": best_match[0],
                    "candidates_checked": len(similarities),
                    "similarity_distribution": {
                        "above_95": _count_at_least(0.95),
                        "above_90": _count_at_least(0.90),
                        "above_85": _count_at_least(0.85),
                        "above_80": _count_at_least(0.80),
                    }
                }
            )

        above_threshold.sort(key=lambda item: item[1], reverse=True)
        # Clamp: a negative count would otherwise slice from the end
        top_matches = above_threshold[:max(count, 0)]

        results: List[Tuple[str, float, SelectionRationale]] = []
        for sample_id, similarity, sample_energy in top_matches:
            energy_matches, energy_delta = self._energy_matches(
                sample_energy, reference_energy
            )
            rationale = SelectionRationale(
                sample_path=self._sample_path(sample_id),
                similarity_to_anchor=similarity,
                energy_match=energy_matches,
                energy_delta=energy_delta,
                selection_reason=f"Cosine similarity {similarity:.3f} >= {min_similarity} to reference"
            )
            results.append((sample_id, similarity, rationale))

        logger.info(
            f"Found {len(results)} samples similar to {reference_id} "
            f"(threshold={min_similarity})"
        )

        return results

    def calculate_kit_coherence(self, sample_paths: List[str]) -> float:
        """
        Calculate the coherence score of a kit (set of samples).

        Coherence is defined as the average pairwise cosine similarity
        between all samples in the set that can be resolved and have an
        embedding.

        Args:
            sample_paths: List of sample paths or IDs

        Returns:
            1.0 when fewer than two entries are given; 0.0 when fewer than
            two can be resolved or no pair has embeddings; otherwise the
            mean pairwise cosine similarity.
        """
        if len(sample_paths) < 2:
            return 1.0  # Single sample is perfectly coherent with itself

        resolved = (self._resolve_sample_id(path) for path in sample_paths)
        sample_ids = [sid for sid in resolved if sid is not None]

        if len(sample_ids) < 2:
            logger.warning(f"Only {len(sample_ids)} valid samples for coherence calculation")
            return 0.0

        embedded = [sid for sid in sample_ids if sid in self.embeddings]
        similarities = [
            self._cosine_similarity(self.embeddings[first], self.embeddings[second])
            for index, first in enumerate(embedded)
            for second in embedded[index + 1:]
        ]

        if not similarities:
            return 0.0

        coherence = float(np.mean(similarities))
        logger.info(
            f"Kit coherence: {coherence:.3f} (from {len(similarities)} pairwise comparisons)"
        )

        return coherence

    def select_coherent_kit(
        self,
        role: str,
        target_energy: float,
        count: int = 4
    ) -> List[SelectedSample]:
        """
        Select a coherent kit of samples for a role.

        Selects an anchor sample within the energy tolerance of
        ``target_energy``, then adds up to ``count - 1`` samples of the same
        role whose cosine similarity to the anchor is at least the coherence
        threshold. Variations are not filtered by energy; their energy match
        against the anchor is recorded in the rationale. Finally the average
        pairwise similarity of the kit is checked against the threshold.

        The kit may contain fewer than ``count`` samples when fewer similar
        samples exist. With ``count`` <= 1 only the anchor is returned.

        Args:
            role: Sample role (e.g., "kick", "snare", "hihat", "bass")
            target_energy: Target energy level
            count: Number of samples to select (default 4: 1 anchor + 3 variations)

        Returns:
            List of SelectedSample objects with coherence scores and rationale

        Raises:
            CoherenceError: If no coherent kit can be formed
        """
        logger.info(
            f"Selecting coherent kit for role='{role}', energy={target_energy:.3f}, count={count}"
        )

        # Clear rationale log for this selection
        self.rationale_log = []

        # Step 1: Select anchor sample
        anchor_id, anchor_rationale = self.select_anchor_sample(role, target_energy)
        selected_ids = [anchor_id]

        # Step 2: Find similar samples to anchor (only when variations are
        # actually requested, so an anchor-only kit cannot fail here)
        similar: List[Tuple[str, float, SelectionRationale]] = []
        if count > 1:
            try:
                similar = self.find_similar_samples(
                    # Look up by ID: a stored path may be empty or ambiguous
                    reference_path=anchor_id,
                    count=count - 1,  # Exclude anchor
                    min_similarity=self.coherence_threshold,
                    role_filter=role  # Must be same role
                )
            except CoherenceError as e:
                # Enhance error with kit context
                raise CoherenceError(
                    f"Cannot form coherent kit for '{role}': {str(e)}",
                    details={
                        **getattr(e, 'details', {}),
                        "anchor_sample": anchor_id,
                        "target_count": count,
                        "role": role
                    }
                ) from e

        # Step 3: Build selected samples list with rationale
        selected: List[SelectedSample] = [
            SelectedSample(
                path=self._sample_path(anchor_id),
                role=role,
                energy=self._get_sample_energy(anchor_id),
                coherence_score=1.0,
                rationale=anchor_rationale
            )
        ]
        self.rationale_log.append(anchor_rationale)

        for sample_id, similarity, rationale in similar:
            if len(selected) >= count:
                break
            selected.append(SelectedSample(
                path=self._sample_path(sample_id),
                role=role,
                energy=self._get_sample_energy(sample_id),
                coherence_score=similarity,
                rationale=rationale
            ))
            selected_ids.append(sample_id)
            self.rationale_log.append(rationale)

        # Step 4: Verify kit coherence (by ID, which is always unambiguous)
        kit_paths = [s.path for s in selected]
        kit_coherence = self.calculate_kit_coherence(selected_ids)

        if kit_coherence < self.coherence_threshold:
            raise CoherenceError(
                f"Selected kit coherence {kit_coherence:.3f} below threshold {self.coherence_threshold}",
                details={
                    "kit_coherence": kit_coherence,
                    "threshold": self.coherence_threshold,
                    "samples_selected": len(selected),
                    "role": role,
                    "sample_paths": kit_paths
                }
            )

        logger.info(
            f"Selected coherent kit: {len(selected)} samples, coherence={kit_coherence:.3f}"
        )

        return selected

    def get_selection_log(self) -> List[Dict[str, Any]]:
        """Get the rationale log of the last kit selection as a list of dictionaries."""
        return [
            {
                "sample_path": r.sample_path,
                "similarity_to_anchor": round(r.similarity_to_anchor, 4),
                "energy_match": r.energy_match,
                "energy_delta": round(r.energy_delta, 4),
                "selection_reason": r.selection_reason
            }
            for r in self.rationale_log
        ]

    def get_available_roles(self) -> List[str]:
        """Get the sorted list of non-empty sample roles in the embeddings."""
        roles = {meta.get("role", "") for meta in self.metadata.values()}
        return sorted(role for role in roles if role)

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the embeddings database."""
        role_counts: Dict[str, int] = {}
        for meta in self.metadata.values():
            role = meta.get("role", "unknown")
            role_counts[role] = role_counts.get(role, 0) + 1

        return {
            "total_samples": len(self.embeddings),
            "embeddings_path": self.embeddings_path,
            "coherence_threshold": self.coherence_threshold,
            "energy_tolerance": self.energy_tolerance,
            "roles": role_counts,
            "embedding_dim": len(next(iter(self.embeddings.values()))) if self.embeddings else 0
        }


# Convenience functions for direct usage

def select_kick_kit(
    target_energy: float,
    count: int = 4,
    embeddings_path: Optional[str] = None
) -> List[SelectedSample]:
    """Select a coherent kick drum kit (``embeddings_path=None`` uses the default index)."""
    selector = IntelligentSampleSelector(embeddings_path)
    return selector.select_coherent_kit("kick", target_energy, count)


def select_snare_kit(
    target_energy: float,
    count: int = 4,
    embeddings_path: Optional[str] = None
) -> List[SelectedSample]:
    """Select a coherent snare drum kit (``embeddings_path=None`` uses the default index)."""
    selector = IntelligentSampleSelector(embeddings_path)
    return selector.select_coherent_kit("snare", target_energy, count)


def select_bass_kit(
    target_energy: float,
    count: int = 4,
    embeddings_path: Optional[str] = None
) -> List[SelectedSample]:
    """Select a coherent bass kit (``embeddings_path=None`` uses the default index)."""
    selector = IntelligentSampleSelector(embeddings_path)
    return selector.select_coherent_kit("bass", target_energy, count)


def find_similar(
    reference_path: str,
    count: int = 5,
    embeddings_path: Optional[str] = None
) -> List[Tuple[str, float]]:
    """Find samples similar to a reference; returns (sample_path, similarity) pairs."""
    selector = IntelligentSampleSelector(embeddings_path)
    results = selector.find_similar_samples(reference_path, count)
    # SelectionRationale exposes ``sample_path`` (it has no ``path`` attribute)
    return [(rationale.sample_path, score) for _, score, rationale in results]