# ableton-mcp-ai/mcp_server/engines/intelligent_selector.py

"""
IntelligentSampleSelector - Coherent Sample Selection Engine

Uses embeddings from .embeddings_index.json to select samples that work
together musically based on cosine similarity.

Architecture:
- Embedding-based matching using cosine similarity
- Energy matching for intensity coherence
- Coherence threshold: 0.90 (configurable)
- Never falls back to random selection
"""

import json
import os
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple, NamedTuple
from dataclasses import dataclass
import numpy as np

# Module-level logger named after this module's import name.
logger = logging.getLogger(__name__)


class CoherenceError(Exception):
    """Signals that no sample (or kit) satisfies the coherence criteria.

    Attributes:
        details: Diagnostic data supplied by the raiser. An empty dict is
            stored when no details (or an empty mapping) were given.
    """

    def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
        super().__init__(message)
        if details:
            self.details = details
        else:
            self.details = {}


@dataclass
class SelectionRationale:
    """Records why one sample was picked during a selection run."""
    # Sample path taken from the metadata (the selector falls back to the
    # sample ID when the metadata entry has no "path" key).
    sample_path: str
    # Cosine similarity to the reference sample; the selector stores 1.0
    # for the anchor itself.
    similarity_to_anchor: float
    # Whether the sample's energy was within the selector's tolerance.
    energy_match: bool
    # Relative energy difference that energy_match was derived from.
    energy_delta: float
    # Human-readable explanation of the pick.
    selection_reason: str


@dataclass
class SelectedSample:
    """One sample of a selected kit together with its selection metadata."""
    # Sample path from the metadata (or the sample ID as a fallback).
    path: str
    # Role the kit was requested for (e.g. "kick").
    role: str
    # Energy value stored in the sample's metadata.
    energy: float
    # Cosine similarity to the kit's anchor; the anchor itself gets 1.0.
    coherence_score: float
    # Explanation of why this sample was chosen.
    rationale: SelectionRationale


class IntelligentSampleSelector:
    """
    Selects coherent sample sets using embedding-based similarity.

    Uses embeddings from .embeddings_index.json and calculates
    cosine similarity to find samples that work together musically.

    Coherence threshold: 0.90 (samples must be 90% similar)
    Energy matching: ±10% of target energy

    Never falls back to random selection - raises CoherenceError if
    no samples meet criteria.
    """

    def __init__(
        self,
        embeddings_path: Optional[str] = None,
        coherence_threshold: float = 0.90,
        energy_tolerance: float = 0.10
    ):
        """
        Initialize the selector.

        Args:
            embeddings_path: Path to .embeddings_index.json
            coherence_threshold: Minimum cosine similarity (default 0.90)
            energy_tolerance: Energy matching tolerance (default 0.10 = ±10%)
        """
        self.coherence_threshold = coherence_threshold
        self.energy_tolerance = energy_tolerance
        self.embeddings: Dict[str, np.ndarray] = {}
        self.metadata: Dict[str, Dict[str, Any]] = {}
        self.rationale_log: List[SelectionRationale] = []

        # Default path: project root / .embeddings_index.json
        if embeddings_path is None:
            # Try to find embeddings in project root
            script_dir = Path(__file__).parent.parent.parent
            embeddings_path = str(script_dir / ".." / "libreria" / "reggaeton" / ".embeddings_index.json")

        self.embeddings_path = embeddings_path
        self._load_embeddings()

    def _load_embeddings(self) -> None:
        """Load embeddings and metadata from JSON file."""
        if not os.path.exists(self.embeddings_path):
            raise FileNotFoundError(
                f"Embeddings file not found: {self.embeddings_path}. "
                f"Run sample analysis first to generate embeddings."
            )

        try:
            with open(self.embeddings_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Load embeddings (support both formats)
            if "embeddings" in data:
                # Format: { "embeddings": { "path": [vector], ... } }
                for sample_path, vector in data["embeddings"].items():
                    if vector and len(vector) > 0:
                        self.embeddings[sample_path] = np.array(vector, dtype=np.float32)
                        # Infer role from folder name
                        folder = os.path.basename(os.path.dirname(sample_path))
                        self.metadata[sample_path] = {
                            "path": sample_path,
                            "energy": vector[3] if len(vector) > 3 else 0.0,  # RMS is typically index 3
                            "bpm": vector[1] * 200 if len(vector) > 1 else 0.0,  # Denormalize BPM
                            "key": "",  # Not stored in this format
                            "role": folder,
                        }
            elif "samples" in data:
                # Format: { "samples": { "id": { "embedding": [...], ... } } }
                for sample_id, info in data["samples"].items():
                    embedding = info.get("embedding")
                    if embedding:
                        self.embeddings[sample_id] = np.array(embedding, dtype=np.float32)
                        self.metadata[sample_id] = {
                            "path": info.get("path", ""),
                            "energy": info.get("energy", 0.0),
                            "bpm": info.get("bpm", 0.0),
                            "key": info.get("key", ""),
                            "role": info.get("role", "unknown"),
                        }

            logger.info(
                f"Loaded {len(self.embeddings)} embeddings from {self.embeddings_path}"
            )

        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid embeddings JSON: {e}")
        except Exception as e:
            raise RuntimeError(f"Failed to load embeddings: {e}")

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """
        Calculate cosine similarity between two vectors.

        Formula: dot(a, b) / (norm(a) * norm(b))

        Args:
            a: First embedding vector
            b: Second embedding vector

        Returns:
            Cosine similarity in range [-1, 1], typically [0, 1]
        """
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return float(np.dot(a, b) / (norm_a * norm_b))

    def _get_sample_energy(self, sample_id: str) -> float:
        """Get RMS energy for a sample."""
        return self.metadata.get(sample_id, {}).get("energy", 0.0)

    def _energy_matches(self, sample_energy: float, target_energy: float) -> Tuple[bool, float]:
        """
        Check if sample energy matches target within tolerance.

        Args:
            sample_energy: Sample's RMS energy
            target_energy: Target energy level

        Returns:
            Tuple of (matches, delta) where delta is the relative difference
        """
        if target_energy == 0:
            return True, 0.0

        delta = abs(sample_energy - target_energy) / target_energy
        matches = delta <= self.energy_tolerance
        return matches, delta

    def _get_samples_by_role(self, role: str) -> List[str]:
        """Get all sample IDs matching a role."""
        return [
            sid for sid, meta in self.metadata.items()
            if meta.get("role", "").lower() == role.lower()
        ]

    def select_anchor_sample(
        self,
        role: str,
        target_energy: float
    ) -> Tuple[str, SelectionRationale]:
        """
        Find the most representative sample for a role and energy level.

        The anchor is the sample that best represents the target characteristics
        and has the most similar samples around it (highest local density).

        Args:
            role: Sample role (e.g., "kick", "snare", "bass")
            target_energy: Target RMS energy level

        Returns:
            Tuple of (sample_id, rationale)

        Raises:
            CoherenceError: If no samples found for role or no energy matches
        """
        role_samples = self._get_samples_by_role(role)

        if not role_samples:
            available_roles = set(
                m.get("role", "unknown") for m in self.metadata.values()
            )
            raise CoherenceError(
                f"No samples found for role: {role}",
                details={
                    "requested_role": role,
                    "available_roles": list(available_roles),
                    "total_samples": len(self.metadata)
                }
            )

        # Score each sample by: energy match + similarity to other samples
        scored_samples: List[Tuple[str, float, float]] = []  # (id, score, energy)

        for sample_id in role_samples:
            sample_energy = self._get_sample_energy(sample_id)
            energy_matches, energy_delta = self._energy_matches(
                sample_energy, target_energy
            )

            # Skip samples with wildly different energy (optional, can be disabled)
            if not energy_matches:
                continue

            # Calculate average similarity to other samples in role
            if sample_id not in self.embeddings:
                continue

            similarities = []
            for other_id in role_samples:
                if other_id != sample_id and other_id in self.embeddings:
                    sim = self._cosine_similarity(
                        self.embeddings[sample_id],
                        self.embeddings[other_id]
                    )
                    similarities.append(sim)

            avg_similarity = np.mean(similarities) if similarities else 0.0

            # Score: high similarity + energy match
            # Weight: 70% similarity, 30% energy match
            energy_score = 1.0 - energy_delta
            total_score = (0.7 * avg_similarity) + (0.3 * energy_score)

            scored_samples.append((sample_id, total_score, sample_energy))

        if not scored_samples:
            raise CoherenceError(
                f"No samples match energy target for role '{role}'",
                details={
                    "role": role,
                    "target_energy": target_energy,
                    "tolerance": self.energy_tolerance,
                    "candidates": len(role_samples),
                    "sample_energies": [
                        self._get_sample_energy(sid) for sid in role_samples[:10]
                    ]
                }
            )

        # Select best sample
        scored_samples.sort(key=lambda x: x[1], reverse=True)
        anchor_id, score, anchor_energy = scored_samples[0]

        rationale = SelectionRationale(
            sample_path=self.metadata[anchor_id].get("path", anchor_id),
            similarity_to_anchor=1.0,  # Self-similarity
            energy_match=True,
            energy_delta=abs(anchor_energy - target_energy) / target_energy if target_energy else 0.0,
            selection_reason=f"Highest representativeness score ({score:.3f}) for role '{role}' at energy {target_energy:.3f}"
        )

        logger.info(
            f"Selected anchor for {role}: {anchor_id} (score={score:.3f}, energy={anchor_energy:.3f})"
        )

        return anchor_id, rationale

    def find_similar_samples(
        self,
        reference_path: str,
        count: int = 5,
        min_similarity: float = 0.90,
        role_filter: Optional[str] = None
    ) -> List[Tuple[str, float, SelectionRationale]]:
        """
        Find samples similar to a reference sample.

        Args:
            reference_path: Path or ID of reference sample
            count: Number of similar samples to return
            min_similarity: Minimum cosine similarity threshold
            role_filter: Optional role to filter by

        Returns:
            List of (sample_id, similarity, rationale) tuples, sorted by similarity

        Raises:
            CoherenceError: If no samples meet the similarity threshold
        """
        # Find reference sample
        reference_id = None
        for sid, meta in self.metadata.items():
            if meta.get("path") == reference_path or sid == reference_path:
                reference_id = sid
                break

        if reference_id is None:
            raise CoherenceError(
                f"Reference sample not found: {reference_path}",
                details={
                    "reference": reference_path,
                    "available_samples": len(self.metadata)
                }
            )

        if reference_id not in self.embeddings:
            raise CoherenceError(
                f"Reference sample has no embedding: {reference_path}",
                details={"reference_id": reference_id}
            )

        reference_embedding = self.embeddings[reference_id]
        reference_energy = self._get_sample_energy(reference_id)

        # Calculate similarity to all samples
        similarities: List[Tuple[str, float, float]] = []  # (id, similarity, energy)

        for sample_id, embedding in self.embeddings.items():
            if sample_id == reference_id:
                continue

            # Apply role filter
            if role_filter:
                sample_role = self.metadata.get(sample_id, {}).get("role", "")
                if sample_role.lower() != role_filter.lower():
                    continue

            sim = self._cosine_similarity(reference_embedding, embedding)
            energy = self._get_sample_energy(sample_id)
            similarities.append((sample_id, sim, energy))

        # Filter by minimum similarity
        above_threshold = [(sid, sim, e) for sid, sim, e in similarities if sim >= min_similarity]

        if not above_threshold:
            # Find closest match for error details
            similarities.sort(key=lambda x: x[1], reverse=True)
            best_match = similarities[0] if similarities else (None, 0.0, 0.0)

            raise CoherenceError(
                f"No samples meet similarity threshold {min_similarity} for {reference_path}",
                details={
                    "reference": reference_path,
                    "min_similarity": min_similarity,
                    "best_match_similarity": best_match[1] if best_match[0] else 0.0,
                    "best_match_id": best_match[0],
                    "candidates_checked": len(similarities),
                    "similarity_distribution": {
                        "above_95": len([s for s in similarities if s[1] >= 0.95]),
                        "above_90": len([s for s in similarities if s[1] >= 0.90]),
                        "above_85": len([s for s in similarities if s[1] >= 0.85]),
                        "above_80": len([s for s in similarities if s[1] >= 0.80]),
                    }
                }
            )

        # Sort and select top matches
        above_threshold.sort(key=lambda x: x[1], reverse=True)
        top_matches = above_threshold[:count]

        results: List[Tuple[str, float, SelectionRationale]] = []

        for sample_id, similarity, sample_energy in top_matches:
            energy_matches, energy_delta = self._energy_matches(
                sample_energy, reference_energy
            )

            rationale = SelectionRationale(
                sample_path=self.metadata[sample_id].get("path", sample_id),
                similarity_to_anchor=similarity,
                energy_match=energy_matches,
                energy_delta=energy_delta,
                selection_reason=f"Cosine similarity {similarity:.3f} >= {min_similarity} to reference"
            )

            results.append((sample_id, similarity, rationale))

        logger.info(
            f"Found {len(results)} samples similar to {reference_id} "
            f"(threshold={min_similarity})"
        )

        return results

    def calculate_kit_coherence(self, sample_paths: List[str]) -> float:
        """
        Calculate the coherence score of a kit (set of samples).

        Coherence is defined as the average pairwise cosine similarity
        between all samples in the set. Range: 0.0 to 1.0

        Args:
            sample_paths: List of sample paths or IDs

        Returns:
            Coherence score from 0.0 (no coherence) to 1.0 (perfect coherence)
        """
        if len(sample_paths) < 2:
            return 1.0  # Single sample is perfectly coherent with itself

        # Resolve paths to IDs
        sample_ids = []
        for path in sample_paths:
            found_id = None
            for sid, meta in self.metadata.items():
                if meta.get("path") == path or sid == path:
                    found_id = sid
                    break
            if found_id:
                sample_ids.append(found_id)

        if len(sample_ids) < 2:
            logger.warning(f"Only {len(sample_ids)} valid samples for coherence calculation")
            return 0.0

        # Calculate pairwise similarities
        similarities = []
        for i, id1 in enumerate(sample_ids):
            if id1 not in self.embeddings:
                continue
            for id2 in sample_ids[i+1:]:
                if id2 not in self.embeddings:
                    continue
                sim = self._cosine_similarity(
                    self.embeddings[id1],
                    self.embeddings[id2]
                )
                similarities.append(sim)

        if not similarities:
            return 0.0

        coherence = float(np.mean(similarities))

        logger.info(
            f"Kit coherence: {coherence:.3f} (from {len(similarities)} pairwise comparisons)"
        )

        return coherence

    def select_coherent_kit(
        self,
        role: str,
        target_energy: float,
        count: int = 4
    ) -> List[SelectedSample]:
        """
        Select a coherent kit of samples for a role.

        Selects an anchor sample and finds variations that are:
        1. Similar to the anchor (cosine similarity >= 0.90)
        2. Within ±10% of target energy
        3. Coherent with each other

        Args:
            role: Sample role (e.g., "kick", "snare", "hihat", "bass")
            target_energy: Target RMS energy level
            count: Number of samples to select (default 4: 1 anchor + 3 variations)

        Returns:
            List of SelectedSample objects with coherence scores and rationale

        Raises:
            CoherenceError: If no coherent kit can be formed
        """
        logger.info(
            f"Selecting coherent kit for role='{role}', energy={target_energy:.3f}, count={count}"
        )

        # Clear rationale log for this selection
        self.rationale_log = []

        # Step 1: Select anchor sample
        anchor_id, anchor_rationale = self.select_anchor_sample(role, target_energy)
        selected_ids = [anchor_id]

        # Step 2: Find similar samples to anchor
        anchor_path = self.metadata[anchor_id].get("path", anchor_id)

        try:
            similar = self.find_similar_samples(
                reference_path=anchor_path,
                count=count - 1,  # Exclude anchor
                min_similarity=self.coherence_threshold,
                role_filter=role  # Must be same role
            )
        except CoherenceError as e:
            # Enhance error with kit context
            raise CoherenceError(
                f"Cannot form coherent kit for '{role}': {str(e)}",
                details={
                    **getattr(e, 'details', {}),
                    "anchor_sample": anchor_id,
                    "target_count": count,
                    "role": role
                }
            )

        # Step 3: Build selected samples list with rationale
        selected: List[SelectedSample] = []

        # Add anchor
        anchor_energy = self._get_sample_energy(anchor_id)
        selected.append(SelectedSample(
            path=self.metadata[anchor_id].get("path", anchor_id),
            role=role,
            energy=anchor_energy,
            coherence_score=1.0,
            rationale=anchor_rationale
        ))
        self.rationale_log.append(anchor_rationale)

        # Add variations
        for sample_id, similarity, rationale in similar:
            if len(selected) >= count:
                break

            sample_energy = self._get_sample_energy(sample_id)

            selected.append(SelectedSample(
                path=self.metadata[sample_id].get("path", sample_id),
                role=role,
                energy=sample_energy,
                coherence_score=similarity,
                rationale=rationale
            ))
            self.rationale_log.append(rationale)

        # Step 4: Verify kit coherence
        kit_paths = [s.path for s in selected]
        kit_coherence = self.calculate_kit_coherence(kit_paths)

        if kit_coherence < self.coherence_threshold:
            raise CoherenceError(
                f"Selected kit coherence {kit_coherence:.3f} below threshold {self.coherence_threshold}",
                details={
                    "kit_coherence": kit_coherence,
                    "threshold": self.coherence_threshold,
                    "samples_selected": len(selected),
                    "role": role,
                    "sample_paths": kit_paths
                }
            )

        logger.info(
            f"Selected coherent kit: {len(selected)} samples, coherence={kit_coherence:.3f}"
        )

        return selected

    def get_selection_log(self) -> List[Dict[str, Any]]:
        """Get the rationale log as a list of dictionaries."""
        return [
            {
                "sample_path": r.sample_path,
                "similarity_to_anchor": round(r.similarity_to_anchor, 4),
                "energy_match": r.energy_match,
                "energy_delta": round(r.energy_delta, 4),
                "selection_reason": r.selection_reason
            }
            for r in self.rationale_log
        ]

    def get_available_roles(self) -> List[str]:
        """Get list of available sample roles in the embeddings."""
        roles = set()
        for meta in self.metadata.values():
            role = meta.get("role", "")
            if role:
                roles.add(role)
        return sorted(list(roles))

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the embeddings database."""
        role_counts = {}
        for meta in self.metadata.values():
            role = meta.get("role", "unknown")
            role_counts[role] = role_counts.get(role, 0) + 1

        return {
            "total_samples": len(self.embeddings),
            "embeddings_path": self.embeddings_path,
            "coherence_threshold": self.coherence_threshold,
            "energy_tolerance": self.energy_tolerance,
            "roles": role_counts,
            "embedding_dim": len(next(iter(self.embeddings.values())))
                if self.embeddings else 0
        }


# Convenience functions for direct usage
def select_kick_kit(target_energy: float, count: int = 4) -> List[SelectedSample]:
    """Build a default selector and return a coherent kit of kick samples."""
    return IntelligentSampleSelector().select_coherent_kit(
        role="kick", target_energy=target_energy, count=count
    )


def select_snare_kit(target_energy: float, count: int = 4) -> List[SelectedSample]:
    """Build a default selector and return a coherent kit of snare samples."""
    return IntelligentSampleSelector().select_coherent_kit(
        role="snare", target_energy=target_energy, count=count
    )


def select_bass_kit(target_energy: float, count: int = 4) -> List[SelectedSample]:
    """Build a default selector and return a coherent kit of bass samples."""
    return IntelligentSampleSelector().select_coherent_kit(
        role="bass", target_energy=target_energy, count=count
    )


def find_similar(reference_path: str, count: int = 5) -> List[Tuple[str, float]]:
    """Find samples similar to a reference.

    Builds a default selector and runs its similarity search with the
    selector's default threshold and no role filter.

    Args:
        reference_path: Path or ID of the reference sample
        count: Number of similar samples to return

    Returns:
        List of (sample_path, similarity) tuples in the order returned by
        the selector

    Raises:
        CoherenceError: Propagated from the selector when the reference is
            unknown or nothing meets the threshold
    """
    selector = IntelligentSampleSelector()
    results = selector.find_similar_samples(reference_path, count)
    # The rationale stores the path as ``sample_path``; it has no ``path``.
    return [(rationale.sample_path, score) for _, score, rationale in results]