import os
import json
import logging
from pathlib import Path
from typing import List, Dict, Tuple

# The ML stack is optional: without it we degrade to substring search.
try:
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np
    HAS_ML = True
except ImportError:
    HAS_ML = False

logger = logging.getLogger("VectorManager")
logging.basicConfig(level=logging.INFO)


class VectorManager:
    """Semantic search index over an audio sample library.

    Scans ``library_dir`` recursively for audio files, embeds a cleaned-up
    version of each file name (plus its folder path as context) with a small
    sentence-transformers model, and caches the vectors in a hidden JSON file
    inside the library. When the ML dependencies are unavailable, search
    falls back to case-insensitive substring matching over the same metadata.
    """

    def __init__(self, library_dir: str):
        """Open (or build) the index for the library rooted at *library_dir*."""
        self.library_dir = Path(library_dir)
        # Hidden cache file holding both metadata and embedding vectors.
        self.index_file = self.library_dir / ".sample_embeddings.json"
        self.model = None
        self.embeddings = []  # becomes an np.ndarray once built/loaded
        self.metadata: List[Dict] = []

        if HAS_ML:
            try:
                # Very lightweight model keeps load + encode time small.
                logger.info("Loading sentence-transformers model (all-MiniLM-L6-v2)...")
                self.model = SentenceTransformer('all-MiniLM-L6-v2')
            except Exception as e:
                logger.error(f"Failed to load embedding model: {e}")

        self._load_or_build_index()

    def _load_or_build_index(self):
        """Load the cached JSON index if present; otherwise scan and build it.

        A corrupt or unreadable cache triggers a full rebuild rather than
        leaving the manager in an empty state.
        """
        if not self.index_file.exists():
            self._build_index()
            return

        logger.info("Loading existing vector index...")
        try:
            with open(self.index_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self.metadata = data.get('metadata', [])
            if HAS_ML and 'embeddings' in data:
                self.embeddings = np.array(data['embeddings'])
            else:
                # Metadata-only index: semantic search will fall back.
                logger.warning("No embeddings found in loaded index.")
        except Exception as e:
            logger.error(f"Failed to load index: {e}")
            self._build_index()

    def _build_index(self):
        """Scan the library, derive a text description per sample, and embed.

        The description is the file stem with separators normalized to
        spaces, followed by the lowercased relative folder path (folders
        encode duration/type in this library layout). Saves the index to
        :attr:`index_file` when embeddings were generated.
        """
        logger.info(f"Scanning library {self.library_dir} for new embeddings...")
        extensions = {'.wav', '.aif', '.aiff', '.mp3'}

        files_to_process = []
        for ext in extensions:
            files_to_process.extend(self.library_dir.rglob('*' + ext))
            files_to_process.extend(self.library_dir.rglob('*' + ext.upper()))

        if not files_to_process:
            logger.warning(f"No audio files found in {self.library_dir} to embed.")
            return

        texts_to_embed = []
        self.metadata = []
        # sorted() makes metadata (and result tie-breaking) deterministic;
        # set() drops duplicates from the upper/lowercase glob passes.
        for f in sorted(set(files_to_process)):
            # Clean up the name for better semantic understanding.
            name = f.stem
            clean_name = name.replace('_', ' ').replace('-', ' ').lower()

            # Use the relative path as extra context, since folders
            # represent duration and type.
            try:
                rel_path = f.relative_to(self.library_dir)
                parts = rel_path.parts[:-1]
                path_context = " ".join(parts).lower()
            except ValueError:
                path_context = ""

            description = f"{clean_name} {path_context}"
            texts_to_embed.append(description)
            self.metadata.append({
                'path': str(f),
                'name': name,
                'description': description
            })

        if HAS_ML and self.model:
            logger.info(f"Generating vectors for {len(texts_to_embed)} samples. This might take a moment...")
            embeddings = self.model.encode(texts_to_embed)
            self.embeddings = embeddings

            # Persist the vectors alongside the metadata.
            with open(self.index_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'metadata': self.metadata,
                    'embeddings': embeddings.tolist()
                }, f)
            logger.info(f"Saved {len(self.metadata)} embeddings to {self.index_file}.")
        else:
            # Covers both "libraries missing" and "model failed to load".
            logger.error("ML stack unavailable (libraries missing or model failed to load). "
                         "Run 'pip install sentence-transformers scikit-learn numpy'")

    def semantic_search(self, query: str, limit: int = 5) -> List[Dict]:
        """Return up to *limit* metadata dicts ranked by semantic relevance.

        Each result is a copy of the stored metadata with a ``'score'`` key
        added (cosine similarity, higher is better). Falls back to basic
        substring matching if ML is unavailable or no embeddings exist.
        """
        if not HAS_ML or self.model is None or len(self.embeddings) == 0:
            logger.warning("ML unavailable, falling back to substring search.")
            return self._fallback_search(query, limit)

        logger.info(f"Performing semantic search for: '{query}'")
        query_emb = self.model.encode([query])

        # Cosine similarity between the query and every stored embedding.
        similarities = cosine_similarity(query_emb, self.embeddings)[0]

        # Indices of the highest-similarity samples, best first.
        top_indices = np.argsort(similarities)[::-1][:limit]

        results = []
        for idx in top_indices:
            score = float(similarities[idx])
            meta = self.metadata[idx].copy()
            meta['score'] = score
            results.append(meta)
        return results

    def _fallback_search(self, query: str, limit: int = 5) -> List[Dict]:
        """Substring-match fallback used when embeddings are unavailable.

        Scores +10 for a name hit and +5 for a description hit. Results
        mirror :meth:`semantic_search`'s shape: each dict carries a
        ``'score'`` key (previously missing here, which made callers that
        read ``r['score']`` crash on the fallback path).
        """
        query = query.lower()
        scored = []
        for m in self.metadata:
            score = 0
            if query in m['name'].lower():
                score += 10
            if query in m['description'].lower():
                score += 5
            if score > 0:
                entry = m.copy()
                entry['score'] = score
                scored.append((score, entry))
        scored.sort(key=lambda x: x[0], reverse=True)
        return [m for s, m in scored[:limit]]


if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        path = sys.argv[1]
        vm = VectorManager(path)
        if len(sys.argv) > 2:
            query = sys.argv[2]
            res = vm.semantic_search(query)
            print("Search Results for", query)
            for r in res:
                print(r['score'], r['name'], r['path'])
    else:
        print("Usage: python vector_manager.py [search_query]")