# ableton-mcp-ai/mcp_server/engines/sample_selector.py

"""
Sample Selector - Intelligent sample selection with metadata store integration.

Indexes libreria/reggaeton and returns sample packs by genre with support for:
- Database-first queries with SQLite caching
- Graceful degradation when numpy is unavailable
- Hybrid analysis with automatic caching

Usage:
    from engines.sample_selector import SampleSelector, get_selector

    # With metadata store
    selector = SampleSelector(metadata_store=store)
    samples = selector.select_for_genre("reggaeton")

    # Without numpy (database-only mode)
    samples = selector.get_samples_without_numpy("kick", count=10)
"""

import json
import logging
import os
import random
from pathlib import Path
from typing import Optional, Dict, List, Any, Union
from dataclasses import dataclass, field

logger = logging.getLogger("SampleSelector")

# Senior Architecture: Check numpy availability
NUMPY_AVAILABLE = False
try:
    import numpy as np
    NUMPY_AVAILABLE = True
except ImportError:
    pass

LIBROSA_AVAILABLE = False
try:
    import librosa
    LIBROSA_AVAILABLE = True
except ImportError:
    pass

# Import new metadata store and abstract analyzer
from .metadata_store import SampleMetadataStore, SampleFeatures, create_metadata_store
from .abstract_analyzer import (
    HybridExtractor,
    DatabaseExtractor,
    create_extractor
)

# Default sample library location. SampleSelector falls back to this path
# when it is constructed without an explicit ``library_path``.
REGGAETON_DIR = Path(
    r"C:\ProgramData\Ableton\Live 12 Suite\Resources\MIDI Remote Scripts\libreria\reggaeton"
)

# Maps a sample role to the lowercase names associated with it.
# SampleSelector._get_samples() compares these against a sample's pack
# (first folder below the library root), and
# SampleSelector.get_samples_without_numpy() passes them to the metadata
# store as category names.
_ROLE_MAP = {
    "kick": ["kick"],
    "snare": ["snare"],
    "clap": ["snare", "clap"],
    "hat_closed": ["hi-hat"],
    "hat_open": ["hi-hat"],
    "bass": ["bass"],
    "synth": ["oneshots", "reggaeton 3"],
    "fx": ["fx"],
    "perc": ["perc loop", "hi-hat"],
}


@dataclass
class SampleInfo:
    """Lightweight description of a single sample file.

    Attributes:
        name: File name of the sample.
        path: Path to the sample file.
        role: Role label (for example "kick", "snare", "bass").
        pack: Name of the folder/pack the sample belongs to ("" if unset).
        key: Musical key ("" if unset).
        bpm: Tempo in BPM (0.0 if unset).
    """

    name: str
    path: str
    role: str
    pack: str = ""
    key: str = ""
    bpm: float = 0.0

    @classmethod
    def from_sample_features(cls, features: SampleFeatures, role: str = "") -> "SampleInfo":
        """Build a SampleInfo from a feature record.

        Args:
            features: Record exposing ``path``, ``categories``, ``key`` and
                ``bpm`` attributes.
            role: Role to assign. When empty, the record's first category is
                used, or "unknown" if it has no categories.

        Returns:
            A SampleInfo whose name and pack are derived from ``features.path``
            (file name and parent folder name); a falsy key/bpm becomes
            "" / 0.0.
        """
        sample_path = Path(features.path)

        resolved_role = role
        if not resolved_role:
            tags = features.categories
            resolved_role = tags[0] if tags else "unknown"

        return cls(
            name=sample_path.name,
            path=features.path,
            role=resolved_role,
            pack=sample_path.parent.name,
            key=features.key or "",
            bpm=features.bpm or 0.0,
        )


@dataclass
class DrumKit:
    """Named set of drum samples, one optional sample per slot.

    Every slot defaults to None, so a kit may be only partially filled.
    """

    # Display name of the kit.
    name: str
    # One sample per drum slot; None means the slot is empty.
    kick: Optional[SampleInfo] = None
    snare: Optional[SampleInfo] = None
    clap: Optional[SampleInfo] = None
    hat_closed: Optional[SampleInfo] = None
    hat_open: Optional[SampleInfo] = None


@dataclass
class InstrumentGroup:
    """Selected samples for one track: a drum kit plus bass, synth and fx lists.

    Attributes:
        genre: Genre label the group was built for.
        key: Musical key of the group.
        bpm: Tempo in BPM.
        drums: Drum kit; an empty kit named after the genre is created when
            none is supplied.
        bass: Bass samples.
        synths: Synth samples.
        fx: Effect samples.
    """

    genre: str
    key: str
    bpm: float
    drums: Optional[DrumKit] = None
    bass: List[SampleInfo] = field(default_factory=list)
    synths: List[SampleInfo] = field(default_factory=list)
    fx: List[SampleInfo] = field(default_factory=list)

    def __post_init__(self):
        """Create a default, genre-named DrumKit if no kit was passed in."""
        if self.drums is not None:
            return
        self.drums = DrumKit(name=f"{self.genre.title()} Kit")


class SampleSelector:
    """
    Intelligent sample selector with metadata store integration.

    Supports two modes:
    - Full mode (numpy available): Database + audio analysis with caching
    - Database-only mode: SQLite queries without audio analysis
    """

    def __init__(
        self,
        library_path: Optional[str] = None,
        metadata_store: Optional[SampleMetadataStore] = None,
        embedding_engine=None,
        reference_matcher=None,
        verbose: bool = False
    ):
        """
        Initialize sample selector.

        Args:
            library_path: Path to sample library (default: libreria/reggaeton)
            metadata_store: Optional metadata store instance
            embedding_engine: Optional embedding engine for similarity search
            reference_matcher: Optional reference matcher for style matching
            verbose: Enable verbose logging
        """
        self._library = Path(library_path) if library_path else REGGAETON_DIR
        self._index: List[SampleInfo] = []
        self._indexed = False
        self.verbose = verbose
        self.embedding_engine = embedding_engine
        self.reference_matcher = reference_matcher

        # Senior Architecture: Metadata store integration
        if metadata_store is None and NUMPY_AVAILABLE:
            # Only create metadata store if we can populate it
            db_path = str(self._library.parent / "sample_metadata.db")
            self.metadata_store = create_metadata_store(db_path)
            if self.verbose:
                logger.info(f"[SampleSelector] Created metadata store at {db_path}")
        elif metadata_store is not None:
            self.metadata_store = metadata_store
            if self.verbose:
                logger.info("[SampleSelector] Using provided metadata store")
        else:
            self.metadata_store = None
            logger.warning("[SampleSelector] No metadata store available")

        # Initialize extractor (Hybrid or Database-only based on numpy availability)
        self.extractor = create_extractor(self.metadata_store, verbose=verbose)

        # Track extraction mode
        if metadata_store:
            self._extraction_mode = "database_first"
            self.extraction_mode = "database_first"
        elif NUMPY_AVAILABLE and LIBROSA_AVAILABLE:
            self._extraction_mode = "full_analysis"
            self.extraction_mode = "full_analysis"
        else:
            self._extraction_mode = "limited"
            self.extraction_mode = "limited"

        if verbose:
            logger.info(f"[SampleSelector] Mode: {self.extraction_mode}")

        if not NUMPY_AVAILABLE:
            logger.warning("[SampleSelector] Running in DATABASE-ONLY mode (numpy unavailable)")
        elif not LIBROSA_AVAILABLE:
            logger.warning("[SampleSelector] Running in LIMITED mode (librosa unavailable)")
        else:
            logger.info("[SampleSelector] Running in FULL mode (numpy + librosa available)")

    def _build_index(self):
        """Build index from filesystem."""
        if self._indexed:
            return
        self._index = []
        if not self._library.is_dir():
            logger.warning("Library not found: %s", self._library)
            return

        for root, _dirs, files in os.walk(self._library):
            for f in files:
                if f.lower().endswith((".wav", ".aif", ".aiff", ".mp3", ".flac")):
                    fpath = os.path.join(root, f)
                    rel = os.path.relpath(root, str(self._library))
                    pack = rel.split(os.sep)[0] if rel else "unknown"
                    role = self._guess_role(f, rel)
                    self._index.append(SampleInfo(
                        name=f, path=fpath, role=role, pack=pack
                    ))
        self._indexed = True
        logger.info("Indexed %d samples from %s", len(self._index), self._library)

    def _guess_role(self, filename: str, relpath: str) -> str:
        """Guess sample role from filename and path."""
        lower = filename.lower()
        rel = relpath.lower()
        if "kick" in lower or "kick" in rel:
            return "kick"
        if "snare" in lower or "snare" in rel:
            return "snare"
        if "clap" in lower:
            return "clap"
        if "hi-hat" in rel or "hihat" in lower:
            return "hat_closed"
        if "bass" in lower or "bass" in rel:
            return "bass"
        if "fx" in lower or "fx" in rel:
            return "fx"
        if "perc" in lower or "perc" in rel:
            return "perc"
        if "drumloop" in rel:
            return "drum_loop"
        return "synth"

    def _get_samples(self, role: str, limit: int = 10) -> List[SampleInfo]:
        """Get samples by role from filesystem index."""
        self._build_index()
        dirs = _ROLE_MAP.get(role, [])
        results = [s for s in self._index if s.role == role or s.pack in dirs]
        return results[:limit]

    def select_samples_db_only(self, role, count=10, bpm_range=None, key=None):
        """Select samples using only database (no numpy/librosa).

        Args:
            role: Sample role (kick, snare, bass, etc.)
            count: Number of samples to return
            bpm_range: Optional (min, max) BPM range
            key: Optional musical key

        Returns:
            List of SampleInfo objects from database
        """
        if not self.metadata_store:
            logger.error("Metadata store not available")
            return []

        # Query database for samples matching criteria
        features_list = self.metadata_store.get_samples_by_category(role)

        # Filter by BPM range if specified
        if bpm_range and len(bpm_range) == 2:
            min_bpm, max_bpm = bpm_range
            features_list = [
                f for f in features_list
                if min_bpm <= f.bpm <= max_bpm
            ]

        # Filter by key if specified
        if key:
            features_list = [
                f for f in features_list
                if f.key == key
            ]

        # Convert to SampleInfo
        results = []
        for features in features_list[:count]:
            info = SampleInfo(
                path=features.path,
                name=os.path.basename(features.path),
                role=role,
                pack=os.path.basename(os.path.dirname(features.path)),
                key=features.key or "",
                bpm=features.bpm or 0.0
            )
            results.append(info)

        return results

    def _get_samples_librosa(self, role: str, count: int = 10, **kwargs) -> List[SampleInfo]:
        """Get samples using librosa audio analysis.

        This method requires numpy and librosa for audio feature extraction.
        Used as fallback when database has no cached samples.

        Args:
            role: Sample role (kick, snare, bass, etc.)
            count: Number of samples to return
            **kwargs: Additional filter parameters (target_bpm, target_key, etc.)

        Returns:
            List of SampleInfo objects from audio analysis
        """
        if not NUMPY_AVAILABLE or not LIBROSA_AVAILABLE:
            logger.error("Librosa analysis requested but numpy/librosa not available")
            return []

        # Get filesystem samples for this role
        fs_samples = self._get_samples(role, count * 2)
        results = []

        target_bpm = kwargs.get('target_bpm')
        target_key = kwargs.get('target_key')

        for sample in fs_samples:
            try:
                # Analyze audio with librosa
                features = self.extractor.extract(sample.path)
                if features:
                    # Filter by BPM if specified
                    if target_bpm and features.bpm:
                        if abs(features.bpm - target_bpm) > 10:
                            continue
                    # Filter by key if specified
                    if target_key and features.key:
                        if features.key != target_key:
                            continue

                    sample_info = SampleInfo.from_sample_features(features, role=role)
                    results.append(sample_info)
                else:
                    # Analysis failed, use filesystem sample with basic info
                    results.append(sample)
            except Exception as e:
                logger.warning(f"[SampleSelector] Librosa analysis failed for {sample.path}: {e}")
                results.append(sample)

            if len(results) >= count:
                break

        return results[:count]

    def get_samples_without_numpy(self, role: str, count: int = 10) -> List[SampleInfo]:
        """
        Get samples using only SQLite database, no audio analysis.

        This method works entirely without numpy/librosa by querying
        the pre-populated metadata database.

        Args:
            role: Sample role (kick, snare, bass, etc.)
            count: Number of samples to return

        Returns:
            List of SampleInfo objects from database
        """
        logger.info(f"[SampleSelector] Database-only query for role: {role}")

        # Map role to database category
        categories = _ROLE_MAP.get(role, [role])
        results = []

        # Search database for each category
        for category in categories:
            db_results = self.metadata_store.search_samples(
                category=category,
                limit=count
            )

            for features in db_results:
                sample_info = SampleInfo.from_sample_features(features, role=role)
                results.append(sample_info)

            if len(results) >= count:
                break

        # If no database results, fall back to filesystem
        if not results:
            logger.warning(f"[SampleSelector] No database results for {role}, using filesystem fallback")
            return self._get_samples(role, count)

        logger.info(f"[SampleSelector] Found {len(results[:count])} samples for {role} (database-only)")
        return results[:count]

    def select_by_similarity(self, reference_path: str, top_n: int = 10) -> InstrumentGroup:
        """Select samples similar to a reference audio file."""
        try:
            # Import here to avoid circular dependencies
            from . import embedding_engine as ee

            # Find similar samples using embeddings
            similar = ee.find_similar(reference_path, top_n=top_n * 3)

            if not similar:
                logger.warning("No similar samples found for %s, falling back to random", reference_path)
                return self.select_for_genre("reggaeton")

            # Build index if not already done
            self._build_index()

            # Get reference features using extractor (database-first, then analysis)
            ref_features = self.extractor.get_features(reference_path)
            ref_bpm = ref_features.get("bpm", 95.0) if ref_features else 95.0
            ref_key = ref_features.get("key", "Am") if ref_features else "Am"

            group = InstrumentGroup(genre="similar_to_reference", key=ref_key, bpm=ref_bpm)

            # Filter similar samples by role
            kick_samples = [s for s in similar if s.role == "kick"][:3]
            snare_samples = [s for s in similar if s.role in ("snare", "clap")][:3]
            hat_samples = [s for s in similar if s.role in ("hat_closed", "hat_open")][:3]
            bass_samples = [s for s in similar if s.role == "bass"][:5]
            synth_samples = [s for s in similar if s.role in ("synth", "oneshot")][:5]
            fx_samples = [s for s in similar if s.role == "fx"][:3]

            # Build drum kit
            group.drums = DrumKit(
                name="Similar Kit",
                kick=kick_samples[0] if kick_samples else None,
                snare=snare_samples[0] if snare_samples else None,
                clap=snare_samples[1] if len(snare_samples) > 1 else None,
                hat_closed=hat_samples[0] if hat_samples else None,
                hat_open=hat_samples[1] if len(hat_samples) > 1 else None,
            )

            # Fill other instruments
            group.bass = bass_samples
            group.synths = synth_samples
            group.fx = fx_samples

            logger.info("Selected %d similar samples for reference: %s",
                       len([x for x in [group.drums.kick, group.drums.snare] + group.bass + group.synths + group.fx if x]),
                       reference_path)

            return group

        except Exception as e:
            logger.error("Error in select_by_similarity: %s", str(e))
            return self.select_for_genre("reggaeton")

    def select_for_genre(
        self,
        genre: str,
        key: Optional[str] = None,
        bpm: Optional[float] = None
    ) -> InstrumentGroup:
        """
        Select a complete sample pack for the given genre.

        Uses database-first approach: queries SQLite for cached samples,
        only analyzing new samples if numpy is available.

        Args:
            genre: Genre to select samples for
            key: Musical key (default: Am)
            bpm: Tempo in BPM (default: 95.0)

        Returns:
            InstrumentGroup with selected samples
        """
        self._build_index()
        if not self._index:
            raise ValueError("No samples found in %s" % self._library)

        group = InstrumentGroup(genre=genre, key=key or "Am", bpm=bpm or 95.0)

        # Try database-first for each role, fallback to filesystem
        if isinstance(self.extractor, DatabaseExtractor) or not NUMPY_AVAILABLE:
            # Database-only mode
            logger.info("[SampleSelector] Using database-only selection")
            kick = self.get_samples_without_numpy("kick", 3)
            snare = self.get_samples_without_numpy("snare", 3)
            clap = self.get_samples_without_numpy("clap", 2)
            hats = self.get_samples_without_numpy("hat_closed", 4)
            bass = self.get_samples_without_numpy("bass", 5)
            synths = self.get_samples_without_numpy("synth", 5)
            fx = self.get_samples_without_numpy("fx", 3)
        else:
            # Hybrid mode: database first, then analyze uncached samples
            logger.info("[SampleSelector] Using hybrid selection (database + analysis)")

            kick = self._get_samples_hybrid("kick", 3)
            snare = self._get_samples_hybrid("snare", 3)
            clap = self._get_samples_hybrid("clap", 2)
            hats = self._get_samples_hybrid("hat_closed", 4)
            bass = self._get_samples_hybrid("bass", 5)
            synths = self._get_samples_hybrid("synth", 5)
            fx = self._get_samples_hybrid("fx", 3)

        # Build drum kit
        group.drums = DrumKit(
            name="%s Kit" % genre.title(),
            kick=kick[0] if kick else None,
            snare=snare[0] if snare else None,
            clap=clap[0] if clap else (snare[1] if len(snare) > 1 else None),
            hat_closed=hats[0] if hats else None,
            hat_open=hats[1] if len(hats) > 1 else None,
        )

        # Fill other instruments
        group.bass = bass
        group.synths = synths
        group.fx = fx

        return group

    def _get_samples_hybrid(self, role: str, count: int) -> List[SampleInfo]:
        """
        Get samples using hybrid approach: database first, analyze if needed.

        Args:
            role: Sample role
            count: Number of samples needed

        Returns:
            List of SampleInfo objects
        """
        results = []

        # Get filesystem samples for this role
        fs_samples = self._get_samples(role, count * 2)

        for sample in fs_samples:
            # Try database first
            db_features = self.metadata_store.get_sample_features(sample.path)

            if db_features:
                # Cache hit - use database result
                sample_info = SampleInfo.from_sample_features(db_features, role=role)
                results.append(sample_info)
            elif NUMPY_AVAILABLE and LIBROSA_AVAILABLE:
                # Cache miss - analyze and cache
                try:
                    features = self.extractor.extract(sample.path)
                    if features:
                        sample_info = SampleInfo.from_sample_features(features, role=role)
                        results.append(sample_info)
                    else:
                        # Analysis failed, use filesystem sample
                        results.append(sample)
                except Exception as e:
                    logger.warning(f"[SampleSelector] Analysis failed for {sample.path}: {e}")
                    results.append(sample)
            else:
                # No numpy available, use filesystem sample
                results.append(sample)

            if len(results) >= count:
                break

        return results[:count]

    def get_recommended_samples(self, role, count=10, **kwargs):
        """Get recommended samples with database-first approach."""
        # Try database first
        if self.metadata_store:
            target_bpm = kwargs.get('target_bpm')
            target_key = kwargs.get('target_key')

            bpm_range = None
            if target_bpm:
                bpm_range = (target_bpm - 5, target_bpm + 5)

            db_results = self.select_samples_db_only(role, count, bpm_range=bpm_range, key=target_key)
            if db_results:
                logger.info(f"Retrieved {len(db_results)} samples from database")
                return db_results

        # Fall back to legacy analysis if numpy available
        if NUMPY_AVAILABLE and LIBROSA_AVAILABLE:
            logger.info("Using librosa analysis for samples")
            return self._get_samples_librosa(role, count, **kwargs)

        # Limited mode: return empty with warning
        logger.warning("No metadata store and no numpy - cannot select samples")
        return []


# Global instance
_selector: Optional[SampleSelector] = None


def get_selector(
    library_path: Optional[str] = None,
    metadata_store: Optional[SampleMetadataStore] = None
) -> SampleSelector:
    """
    Return the module-wide SampleSelector, creating it on first use.

    The arguments are only consulted when the instance is first created;
    once a selector exists, later calls return it unchanged regardless of
    what is passed.

    Args:
        library_path: Optional library path for the new selector
        metadata_store: Optional metadata store for the new selector

    Returns:
        SampleSelector singleton
    """
    global _selector
    if _selector is not None:
        return _selector
    _selector = SampleSelector(library_path=library_path, metadata_store=metadata_store)
    return _selector


def select_samples_for_track(
    genre: str,
    key: str = "",
    bpm: float = 0,
    metadata_store: Optional[SampleMetadataStore] = None
) -> InstrumentGroup:
    """
    Convenience wrapper: select samples for a genre via the shared selector.

    An empty ``key`` or a non-positive ``bpm`` is forwarded as None so that
    the selector applies its own defaults.

    Args:
        genre: Genre to select
        key: Musical key ("" for default)
        bpm: Tempo in BPM (0 or less for default)
        metadata_store: Optional metadata store, forwarded to get_selector()

    Returns:
        InstrumentGroup with selected samples
    """
    selector = get_selector(metadata_store=metadata_store)
    chosen_key = key or None
    chosen_bpm = bpm if bpm > 0 else None
    return selector.select_for_genre(genre, chosen_key, chosen_bpm)


def get_drum_kit(
    genre: str = "reggaeton",
    variation: str = "standard",
    metadata_store: Optional[SampleMetadataStore] = None
) -> DrumKit:
    """
    Return the drum kit of the sample group selected for a genre.

    Args:
        genre: Genre for drum kit
        variation: Kit variation style (accepted but not used here)
        metadata_store: Optional metadata store, forwarded to get_selector()

    Returns:
        DrumKit with selected samples
    """
    selector = get_selector(metadata_store=metadata_store)
    return selector.select_for_genre(genre).drums


def get_recommended_samples(
    role: str,
    count: int = 5,
    target_bpm: Optional[float] = None,
    target_key: Optional[str] = None,
    metadata_store: Optional[SampleMetadataStore] = None
) -> List[SampleInfo]:
    """
    Get recommended samples for a role through the shared selector.

    Args:
        role: Sample role/category
        count: Number of samples
        target_bpm: Optional BPM target
        target_key: Optional key target
        metadata_store: Optional metadata store, forwarded to get_selector()

    Returns:
        List of recommended SampleInfo objects
    """
    selector = get_selector(metadata_store=metadata_store)
    filters = {"target_bpm": target_bpm, "target_key": target_key}
    return selector.get_recommended_samples(role=role, count=count, **filters)


def reset_cross_generation_memory():
    """Compatibility stub for resetting selection memory; does nothing."""
    return None


def get_extraction_mode() -> str:
    """
    Report the shared selector's extraction mode (for debugging).

    Calling this creates the shared selector if it does not exist yet.

    Returns:
        Mode string as set by SampleSelector.__init__: "database_first",
        "full_analysis" or "limited".
    """
    return get_selector().extraction_mode


def is_numpy_available() -> bool:
    """Check if numpy is available for analysis.

    Returns:
        The module-level NUMPY_AVAILABLE flag, which is set to True only if
        ``import numpy`` succeeded when this module was loaded.
    """
    return NUMPY_AVAILABLE


def is_librosa_available() -> bool:
    """Check if librosa is available for analysis.

    Returns:
        The module-level LIBROSA_AVAILABLE flag, which is set to True only if
        ``import librosa`` succeeded when this module was loaded.
    """
    return LIBROSA_AVAILABLE