feat: Implement senior audio injection with 5 fallback methods
- Add _cmd_create_arrangement_audio_pattern with a 5-method fallback chain:
  - Method 1: track.insert_arrangement_clip() [Live 12+]
  - Method 2: track.create_audio_clip() [Live 11+]
  - Method 3: arrangement_clips.add_new_clip() [Live 12+]
  - Method 4: Session -> duplicate_clip_to_arrangement [Legacy]
  - Method 5: Session -> Recording [Universal]
- Add _cmd_duplicate_clip_to_arrangement for the session-to-arrangement workflow
- Update skills documentation
- Verified: 3 clips created at positions [0, 4, 8] in Arrangement View

Closes: Audio injection in Arrangement View
This commit is contained in:
840
mcp_server/engines/coherence_scorer.py
Normal file
840
mcp_server/engines/coherence_scorer.py
Normal file
@@ -0,0 +1,840 @@
|
||||
"""
|
||||
CoherenceScorer - Advanced Coherence Calculation Engine
|
||||
|
||||
Calculates multi-dimensional coherence scores between audio samples using
|
||||
timbre similarity (MFCC), transient compatibility, spectral balance, and
|
||||
energy consistency.
|
||||
|
||||
Professional-grade tool with 0.90 threshold enforcement.
|
||||
|
||||
File: AbletonMCP_AI/mcp_server/engines/coherence_scorer.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class CoherenceError(Exception):
    """Raised when a coherence score does not meet the professional threshold.

    Carries the failing score, the list of weak components, and actionable
    suggestions; the formatted report becomes the exception message.
    """

    def __init__(self, score: float, weak_components: List[str], suggestions: List[str]):
        self.score = score
        self.weak_components = weak_components
        self.suggestions = suggestions
        super().__init__(self._format_message())

    def _format_message(self) -> str:
        """Assemble the multi-line diagnostic report used as the message."""
        bar = '=' * 60
        status = 'PASS ✓' if self.score >= 0.90 else 'FAIL ✗'
        parts = [
            f"\n{bar}\n",
            "COHERENCE ERROR: Professional threshold not met\n",
            f"{bar}\n",
            f"Current Score: {self.score:.3f} (MIN_COHERENCE: 0.900)\n",
            f"Status: {status}\n\n",
        ]
        if self.weak_components:
            parts.append(f"Weak Components ({len(self.weak_components)}):\n")
            parts.extend(f" • {comp}\n" for comp in self.weak_components)
        if self.suggestions:
            parts.append("\nSuggestions for Improvement:\n")
            parts.extend(f" {i}. {sug}\n" for i, sug in enumerate(self.suggestions, 1))
        parts.append(f"{bar}\n")
        return ''.join(parts)
|
||||
|
||||
|
||||
@dataclass
class AudioFeatures:
    """Container for features extracted from one audio sample.

    Produced by CoherenceScorer._extract_features and consumed by the
    pairwise component-score calculations.
    """
    mfccs: np.ndarray                # MFCC coefficient matrix (timbre), shape (n_mfcc, frames)
    spectral_centroid: float         # Brightness (mean over frames)
    spectral_rolloff: float          # Bandwidth / high-frequency boundary (mean over frames)
    spectral_flux: np.ndarray        # Onset-strength envelope (spectral change / transients)
    zero_crossing_rate: float        # Noisiness (mean over frames)
    rms_energy: np.ndarray           # Per-frame loudness envelope
    attack_time: float               # Transient attack time in milliseconds
    sustain_level: float             # Mean amplitude after the attack
    low_energy: float                # Low band energy share (20-250Hz), normalized to sum 1 with mid/high
    mid_energy: float                # Mid band energy share (250-2000Hz)
    high_energy: float               # High band energy share (2000-20000Hz)
    duration: float                  # Audio duration in seconds
    sample_rate: int                 # Sample rate used for the analysis
|
||||
|
||||
|
||||
@dataclass
class ScoreBreakdown:
    """Detailed breakdown of the components behind a coherence score."""
    overall_score: float
    timbre_similarity: float        # MFCC cosine similarity (40% weight)
    transient_compatibility: float  # Attack characteristic match (30% weight)
    spectral_balance: float         # Low/mid/high ratio match (20% weight)
    energy_consistency: float       # RMS correlation (10% weight)
    is_professional: bool
    weak_components: List[str]
    suggestions: List[str]

    # Float fields that get rounded on serialization.
    _SCORE_FIELDS = (
        'overall_score',
        'timbre_similarity',
        'transient_compatibility',
        'spectral_balance',
        'energy_consistency',
    )

    def to_dict(self) -> Dict:
        """Serialize the breakdown; score floats are rounded to 4 decimals."""
        out = {name: round(getattr(self, name), 4) for name in self._SCORE_FIELDS}
        out['is_professional'] = self.is_professional
        out['weak_components'] = self.weak_components
        out['suggestions'] = self.suggestions
        return out
|
||||
|
||||
|
||||
class CoherenceScorer:
    """
    Professional coherence calculation engine.

    Calculates multi-dimensional coherence scores between audio samples
    using real audio feature extraction and weighted component analysis.

    Weights:
    - Timbre similarity (MFCC): 40%
    - Transient compatibility: 30%
    - Spectral balance: 20%
    - Energy consistency: 10%

    Professional threshold: 0.90 (MIN_COHERENCE)
    """

    # Professional threshold - no compromise; scores below this raise
    # CoherenceError when enforcement is enabled.
    MIN_COHERENCE = 0.90

    # Component weights for the overall score (must sum to 1.0).
    WEIGHTS = {
        'timbre': 0.40,
        'transient': 0.30,
        'spectral': 0.20,
        'energy': 0.10
    }

    # Per-component minimums used to flag individual weak components
    # (independent of the overall MIN_COHERENCE check).
    THRESHOLDS = {
        'timbre': 0.75,
        'transient': 0.70,
        'spectral': 0.65,
        'energy': 0.60
    }
|
||||
|
||||
    def __init__(self, sample_rate: int = 22050):
        """
        Initialize the CoherenceScorer.

        Args:
            sample_rate: Target sample rate (Hz) audio is resampled to during
                loading/analysis (default 22050).
        """
        self.sample_rate = sample_rate
        # Breakdown of the most recent score_* call; None until one runs.
        self.last_breakdown: Optional[ScoreBreakdown] = None
|
||||
|
||||
def _load_audio(self, file_path: str) -> Tuple[np.ndarray, int]:
|
||||
"""
|
||||
Load audio file using librosa.
|
||||
|
||||
Args:
|
||||
file_path: Path to audio file (.wav, .mp3, etc.)
|
||||
|
||||
Returns:
|
||||
Tuple of (audio_array, sample_rate)
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If file doesn't exist
|
||||
ValueError: If file format unsupported or corrupted
|
||||
"""
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"librosa is required for audio analysis. "
|
||||
"Install with: pip install librosa"
|
||||
)
|
||||
|
||||
path = Path(file_path)
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"Audio file not found: {file_path}")
|
||||
|
||||
if not path.suffix.lower() in ['.wav', '.mp3', '.aif', '.aiff', '.flac']:
|
||||
raise ValueError(f"Unsupported audio format: {path.suffix}")
|
||||
|
||||
try:
|
||||
y, sr = librosa.load(file_path, sr=self.sample_rate, mono=True)
|
||||
if len(y) == 0:
|
||||
raise ValueError(f"Audio file is empty: {file_path}")
|
||||
return y, sr
|
||||
except Exception as e:
|
||||
raise ValueError(f"Failed to load audio file {file_path}: {str(e)}")
|
||||
|
||||
def _extract_features(self, audio: np.ndarray, sr: int) -> AudioFeatures:
|
||||
"""
|
||||
Extract comprehensive audio features.
|
||||
|
||||
Args:
|
||||
audio: Audio time series
|
||||
sr: Sample rate
|
||||
|
||||
Returns:
|
||||
AudioFeatures dataclass with all extracted features
|
||||
"""
|
||||
import librosa
|
||||
|
||||
# Basic spectral features
|
||||
mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
|
||||
spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))
|
||||
spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr))
|
||||
spectral_flux = librosa.onset.onset_strength(y=audio, sr=sr)
|
||||
zcr = np.mean(librosa.feature.zero_crossing_rate(audio))
|
||||
rms = librosa.feature.rms(y=audio)[0]
|
||||
|
||||
# Band energy analysis
|
||||
# Low: 20-250Hz, Mid: 250-2000Hz, High: 2000-20000Hz
|
||||
stft = np.abs(librosa.stft(audio))
|
||||
freqs = librosa.fft_frequencies(sr=sr)
|
||||
|
||||
low_mask = (freqs >= 20) & (freqs <= 250)
|
||||
mid_mask = (freqs > 250) & (freqs <= 2000)
|
||||
high_mask = (freqs > 2000) & (freqs <= 20000)
|
||||
|
||||
low_energy = np.sum(stft[low_mask, :]) / stft.shape[1]
|
||||
mid_energy = np.sum(stft[mid_mask, :]) / stft.shape[1]
|
||||
high_energy = np.sum(stft[high_mask, :]) / stft.shape[1]
|
||||
|
||||
# Normalize band energies
|
||||
total_energy = low_energy + mid_energy + high_energy
|
||||
if total_energy > 0:
|
||||
low_energy /= total_energy
|
||||
mid_energy /= total_energy
|
||||
high_energy /= total_energy
|
||||
|
||||
# Transient analysis (attack detection)
|
||||
onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
|
||||
onset_frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr)
|
||||
|
||||
if len(onset_frames) > 0:
|
||||
# Calculate average attack time from first transient
|
||||
first_onset = onset_frames[0]
|
||||
window_start = max(0, first_onset - 10)
|
||||
window_end = min(len(audio), first_onset + 50)
|
||||
|
||||
if window_end > window_start:
|
||||
attack_segment = audio[window_start:window_end]
|
||||
# Attack time: time from 10% to 90% of peak
|
||||
peak_idx = np.argmax(np.abs(attack_segment))
|
||||
peak_val = np.abs(attack_segment[peak_idx])
|
||||
|
||||
if peak_val > 0:
|
||||
# Find 10% and 90% points
|
||||
ten_percent = 0.1 * peak_val
|
||||
ninety_percent = 0.9 * peak_val
|
||||
|
||||
ten_idx = np.where(np.abs(attack_segment[:peak_idx]) >= ten_percent)[0]
|
||||
ninety_idx = np.where(np.abs(attack_segment[:peak_idx]) >= ninety_percent)[0]
|
||||
|
||||
if len(ten_idx) > 0 and len(ninety_idx) > 0:
|
||||
attack_time = (ninety_idx[0] - ten_idx[0]) / sr * 1000 # ms
|
||||
else:
|
||||
attack_time = 10.0 # Default 10ms
|
||||
else:
|
||||
attack_time = 10.0
|
||||
|
||||
# Sustain level: average after attack
|
||||
sustain_start = peak_idx + int(0.01 * sr) # 10ms after peak
|
||||
if sustain_start < len(attack_segment):
|
||||
sustain_level = np.mean(np.abs(attack_segment[sustain_start:]))
|
||||
else:
|
||||
sustain_level = 0.0
|
||||
else:
|
||||
attack_time = 10.0
|
||||
sustain_level = np.mean(np.abs(audio)) * 0.5
|
||||
else:
|
||||
attack_time = 50.0 # Long attack for non-transient sounds
|
||||
sustain_level = np.mean(np.abs(audio))
|
||||
|
||||
return AudioFeatures(
|
||||
mfccs=mfccs,
|
||||
spectral_centroid=spectral_centroid,
|
||||
spectral_rolloff=spectral_rolloff,
|
||||
spectral_flux=spectral_flux,
|
||||
zero_crossing_rate=zcr,
|
||||
rms_energy=rms,
|
||||
attack_time=attack_time,
|
||||
sustain_level=float(sustain_level),
|
||||
low_energy=float(low_energy),
|
||||
mid_energy=float(mid_energy),
|
||||
high_energy=float(high_energy),
|
||||
duration=len(audio) / sr,
|
||||
sample_rate=sr
|
||||
)
|
||||
|
||||
def _calculate_timbre_similarity(self, feat1: AudioFeatures, feat2: AudioFeatures) -> float:
|
||||
"""
|
||||
Calculate timbre similarity using MFCC cosine similarity.
|
||||
|
||||
Uses mean MFCC vectors and accounts for temporal evolution.
|
||||
|
||||
Args:
|
||||
feat1: Features from first sample
|
||||
feat2: Features from second sample
|
||||
|
||||
Returns:
|
||||
Similarity score 0.0-1.0
|
||||
"""
|
||||
# Mean MFCC vectors
|
||||
mfcc1_mean = np.mean(feat1.mfccs, axis=1)
|
||||
mfcc2_mean = np.mean(feat2.mfccs, axis=1)
|
||||
|
||||
# Cosine similarity
|
||||
dot_product = np.dot(mfcc1_mean, mfcc2_mean)
|
||||
norm1 = np.linalg.norm(mfcc1_mean)
|
||||
norm2 = np.linalg.norm(mfcc2_mean)
|
||||
|
||||
if norm1 == 0 or norm2 == 0:
|
||||
return 0.0
|
||||
|
||||
cosine_sim = dot_product / (norm1 * norm2)
|
||||
|
||||
# Convert from [-1, 1] to [0, 1]
|
||||
similarity = (cosine_sim + 1) / 2
|
||||
|
||||
# Also compare spectral centroid (brightness match)
|
||||
centroid_diff = abs(feat1.spectral_centroid - feat2.spectral_centroid)
|
||||
max_centroid = max(feat1.spectral_centroid, feat2.spectral_centroid)
|
||||
if max_centroid > 0:
|
||||
centroid_sim = 1 - (centroid_diff / max_centroid)
|
||||
else:
|
||||
centroid_sim = 1.0
|
||||
|
||||
# Weighted combination: 80% MFCC, 20% centroid
|
||||
final_similarity = 0.8 * similarity + 0.2 * centroid_sim
|
||||
|
||||
return float(np.clip(final_similarity, 0.0, 1.0))
|
||||
|
||||
def _calculate_transient_compatibility(self, feat1: AudioFeatures, feat2: AudioFeatures) -> float:
|
||||
"""
|
||||
Calculate transient/attack characteristic compatibility.
|
||||
|
||||
Compares attack times, sustain levels, and spectral flux patterns.
|
||||
|
||||
Args:
|
||||
feat1: Features from first sample
|
||||
feat2: Features from second sample
|
||||
|
||||
Returns:
|
||||
Compatibility score 0.0-1.0
|
||||
"""
|
||||
# Attack time compatibility
|
||||
attack_diff = abs(feat1.attack_time - feat2.attack_time)
|
||||
max_attack = max(feat1.attack_time, feat2.attack_time, 1.0)
|
||||
attack_compatibility = 1 - (attack_diff / max_attack)
|
||||
|
||||
# Sustain level compatibility
|
||||
max_sustain = max(feat1.sustain_level, feat2.sustain_level, 0.001)
|
||||
sustain_diff = abs(feat1.sustain_level - feat2.sustain_level)
|
||||
sustain_compatibility = 1 - (sustain_diff / max_sustain)
|
||||
|
||||
# Spectral flux pattern correlation
|
||||
flux1 = feat1.spectral_flux
|
||||
flux2 = feat2.spectral_flux
|
||||
|
||||
# Normalize lengths
|
||||
min_len = min(len(flux1), len(flux2))
|
||||
if min_len > 1:
|
||||
flux1_norm = flux1[:min_len]
|
||||
flux2_norm = flux2[:min_len]
|
||||
|
||||
# Normalize to unit vectors
|
||||
flux1_norm = flux1_norm / (np.linalg.norm(flux1_norm) + 1e-10)
|
||||
flux2_norm = flux2_norm / (np.linalg.norm(flux2_norm) + 1e-10)
|
||||
|
||||
flux_corr = np.corrcoef(flux1_norm, flux2_norm)[0, 1]
|
||||
if np.isnan(flux_corr):
|
||||
flux_corr = 0.0
|
||||
else:
|
||||
flux_corr = 0.5
|
||||
|
||||
# Weighted combination
|
||||
# Attack: 40%, Sustain: 30%, Flux correlation: 30%
|
||||
compatibility = (
|
||||
0.4 * attack_compatibility +
|
||||
0.3 * sustain_compatibility +
|
||||
0.3 * max(0, flux_corr) # Clip negative correlations
|
||||
)
|
||||
|
||||
return float(np.clip(compatibility, 0.0, 1.0))
|
||||
|
||||
def _calculate_spectral_balance(self, feat1: AudioFeatures, feat2: AudioFeatures) -> float:
|
||||
"""
|
||||
Calculate spectral balance match (low/mid/high ratio comparison).
|
||||
|
||||
Args:
|
||||
feat1: Features from first sample
|
||||
feat2: Features from second sample
|
||||
|
||||
Returns:
|
||||
Balance score 0.0-1.0
|
||||
"""
|
||||
# Energy band ratios
|
||||
bands1 = np.array([feat1.low_energy, feat1.mid_energy, feat1.high_energy])
|
||||
bands2 = np.array([feat2.low_energy, feat2.mid_energy, feat2.high_energy])
|
||||
|
||||
# Cosine similarity of band distributions
|
||||
dot = np.dot(bands1, bands2)
|
||||
norm1 = np.linalg.norm(bands1)
|
||||
norm2 = np.linalg.norm(bands2)
|
||||
|
||||
if norm1 == 0 or norm2 == 0:
|
||||
return 0.5
|
||||
|
||||
balance_sim = dot / (norm1 * norm2)
|
||||
|
||||
# Also compare rolloff (high-frequency content boundary)
|
||||
rolloff_diff = abs(feat1.spectral_rolloff - feat2.spectral_rolloff)
|
||||
max_rolloff = max(feat1.spectral_rolloff, feat2.spectral_rolloff, 1.0)
|
||||
rolloff_sim = 1 - (rolloff_diff / max_rolloff)
|
||||
|
||||
# Combined: 70% band balance, 30% rolloff match
|
||||
final_balance = 0.7 * balance_sim + 0.3 * rolloff_sim
|
||||
|
||||
return float(np.clip(final_balance, 0.0, 1.0))
|
||||
|
||||
def _calculate_energy_consistency(self, feat1: AudioFeatures, feat2: AudioFeatures) -> float:
|
||||
"""
|
||||
Calculate energy envelope consistency.
|
||||
|
||||
Compares RMS energy patterns and overall loudness.
|
||||
|
||||
Args:
|
||||
feat1: Features from first sample
|
||||
feat2: Features from second sample
|
||||
|
||||
Returns:
|
||||
Consistency score 0.0-1.0
|
||||
"""
|
||||
rms1 = feat1.rms_energy
|
||||
rms2 = feat2.rms_energy
|
||||
|
||||
# Match lengths
|
||||
min_len = min(len(rms1), len(rms2))
|
||||
if min_len < 2:
|
||||
return 0.5
|
||||
|
||||
rms1_norm = rms1[:min_len]
|
||||
rms2_norm = rms2[:min_len]
|
||||
|
||||
# Normalize
|
||||
max_rms1 = np.max(rms1_norm) + 1e-10
|
||||
max_rms2 = np.max(rms2_norm) + 1e-10
|
||||
|
||||
rms1_norm = rms1_norm / max_rms1
|
||||
rms2_norm = rms2_norm / max_rms2
|
||||
|
||||
# Correlation of energy envelopes
|
||||
corr = np.corrcoef(rms1_norm, rms2_norm)[0, 1]
|
||||
if np.isnan(corr):
|
||||
corr = 0.0
|
||||
|
||||
# Mean energy similarity
|
||||
mean1 = np.mean(feat1.rms_energy)
|
||||
mean2 = np.mean(feat2.rms_energy)
|
||||
max_mean = max(mean1, mean2, 0.001)
|
||||
mean_sim = 1 - (abs(mean1 - mean2) / max_mean)
|
||||
|
||||
# Combined: 60% correlation, 40% mean level
|
||||
consistency = 0.6 * max(0, corr) + 0.4 * mean_sim
|
||||
|
||||
return float(np.clip(consistency, 0.0, 1.0))
|
||||
|
||||
def score_pair(self, sample1_path: str, sample2_path: str, enforce_threshold: bool = True) -> float:
|
||||
"""
|
||||
Calculate coherence score between two samples.
|
||||
|
||||
Args:
|
||||
sample1_path: Path to first audio file
|
||||
sample2_path: Path to second audio file
|
||||
enforce_threshold: If True, raises CoherenceError if score < 0.90
|
||||
|
||||
Returns:
|
||||
Overall coherence score (0.0-1.0)
|
||||
|
||||
Raises:
|
||||
CoherenceError: If score < MIN_COHERENCE and enforce_threshold=True
|
||||
FileNotFoundError: If audio files not found
|
||||
ValueError: If audio loading fails
|
||||
"""
|
||||
# Load and extract features
|
||||
audio1, sr1 = self._load_audio(sample1_path)
|
||||
audio2, sr2 = self._load_audio(sample2_path)
|
||||
|
||||
feat1 = self._extract_features(audio1, sr1)
|
||||
feat2 = self._extract_features(audio2, sr2)
|
||||
|
||||
# Calculate component scores
|
||||
timbre_score = self._calculate_timbre_similarity(feat1, feat2)
|
||||
transient_score = self._calculate_transient_compatibility(feat1, feat2)
|
||||
spectral_score = self._calculate_spectral_balance(feat1, feat2)
|
||||
energy_score = self._calculate_energy_consistency(feat1, feat2)
|
||||
|
||||
# Calculate weighted overall score
|
||||
overall_score = (
|
||||
self.WEIGHTS['timbre'] * timbre_score +
|
||||
self.WEIGHTS['transient'] * transient_score +
|
||||
self.WEIGHTS['spectral'] * spectral_score +
|
||||
self.WEIGHTS['energy'] * energy_score
|
||||
)
|
||||
|
||||
# Identify weak components
|
||||
weak_components = []
|
||||
suggestions = []
|
||||
|
||||
scores = {
|
||||
'timbre_similarity': timbre_score,
|
||||
'transient_compatibility': transient_score,
|
||||
'spectral_balance': spectral_score,
|
||||
'energy_consistency': energy_score
|
||||
}
|
||||
|
||||
for component, score in scores.items():
|
||||
threshold = self.THRESHOLDS.get(component.replace('_similarity', 'timbre')
|
||||
.replace('_compatibility', 'transient')
|
||||
.replace('_balance', 'spectral')
|
||||
.replace('_consistency', 'energy'), 0.6)
|
||||
if score < threshold:
|
||||
weak_components.append(f"{component}: {score:.3f} (threshold: {threshold:.2f})")
|
||||
|
||||
# Add specific suggestions
|
||||
if 'timbre' in component:
|
||||
suggestions.append(
|
||||
"Consider samples from the same source/pack for timbral consistency. "
|
||||
"Try layering with a shared reverb bus."
|
||||
)
|
||||
elif 'transient' in component:
|
||||
suggestions.append(
|
||||
"Adjust transient timing with warp markers or apply transient shaping. "
|
||||
"Samples have different attack characteristics."
|
||||
)
|
||||
elif 'spectral' in component:
|
||||
suggestions.append(
|
||||
"Use EQ to match frequency profiles. "
|
||||
"Check if samples occupy different frequency ranges."
|
||||
)
|
||||
elif 'energy' in component:
|
||||
suggestions.append(
|
||||
"Adjust clip gain to match perceived loudness. "
|
||||
"Apply compression for consistent dynamics."
|
||||
)
|
||||
|
||||
# Create breakdown
|
||||
self.last_breakdown = ScoreBreakdown(
|
||||
overall_score=overall_score,
|
||||
timbre_similarity=timbre_score,
|
||||
transient_compatibility=transient_score,
|
||||
spectral_balance=spectral_score,
|
||||
energy_consistency=energy_score,
|
||||
is_professional=overall_score >= self.MIN_COHERENCE,
|
||||
weak_components=weak_components,
|
||||
suggestions=list(set(suggestions)) # Remove duplicates
|
||||
)
|
||||
|
||||
# Enforce professional threshold
|
||||
if enforce_threshold and overall_score < self.MIN_COHERENCE:
|
||||
raise CoherenceError(overall_score, weak_components, suggestions)
|
||||
|
||||
return overall_score
|
||||
|
||||
    def score_kit(self, sample_paths: List[str], enforce_threshold: bool = True) -> float:
        """
        Calculate overall kit coherence (average of all pairwise scores).

        Runs score_pair over every unordered pair, averages the results, and
        records a kit-level ScoreBreakdown (per-component fields zeroed).

        Args:
            sample_paths: List of audio file paths
            enforce_threshold: If True, raises CoherenceError if score < 0.90

        Returns:
            Kit coherence score (0.0-1.0)

        Raises:
            CoherenceError: If score < MIN_COHERENCE and enforce_threshold=True
            ValueError: If fewer than 2 samples provided
        """
        if len(sample_paths) < 2:
            raise ValueError("Need at least 2 samples to calculate kit coherence")

        # Calculate all pairwise scores
        scores = []
        pair_details = []

        for i in range(len(sample_paths)):
            for j in range(i + 1, len(sample_paths)):
                try:
                    score = self.score_pair(
                        sample_paths[i],
                        sample_paths[j],
                        enforce_threshold=False  # Don't raise until we check all
                    )
                    scores.append(score)
                    pair_details.append({
                        'pair': (Path(sample_paths[i]).name, Path(sample_paths[j]).name),
                        'score': score
                    })
                except Exception as e:
                    # NOTE(review): a failed comparison appends 0.0 to scores
                    # but not to pair_details, so it drags the average down
                    # without appearing in the weak-pair report — confirm
                    # this asymmetry is intended.
                    print(f"Warning: Could not compare {sample_paths[i]} vs {sample_paths[j]}: {e}")
                    scores.append(0.0)

        if not scores:
            raise ValueError("No valid pairwise comparisons could be made")

        # Average score across all successful (and failed=0.0) pairs
        kit_score = np.mean(scores)

        # Find worst pairs (ascending by score; weak = below 0.75)
        sorted_pairs = sorted(pair_details, key=lambda x: x['score'])
        weak_pairs = [p for p in sorted_pairs if p['score'] < 0.75]

        # Build suggestions
        suggestions = []
        if weak_pairs:
            worst = weak_pairs[:3]  # Top 3 worst
            suggestions.append(
                f"{len(weak_pairs)} weak pair(s) detected. "
                f"Worst: {worst[0]['pair']} = {worst[0]['score']:.3f}"
            )
            suggestions.append(
                "Consider replacing or processing weak pairs for better cohesion."
            )

        # Per-component fields are zeroed: they are not meaningful for a
        # kit-wide average, only the overall score and weak-pair report are.
        self.last_breakdown = ScoreBreakdown(
            overall_score=kit_score,
            timbre_similarity=0.0,  # Not meaningful for kit average
            transient_compatibility=0.0,
            spectral_balance=0.0,
            energy_consistency=0.0,
            is_professional=kit_score >= self.MIN_COHERENCE,
            weak_components=[f"Weak pair: {p['pair']} ({p['score']:.3f})" for p in weak_pairs[:3]],
            suggestions=suggestions
        )

        if enforce_threshold and kit_score < self.MIN_COHERENCE:
            raise CoherenceError(kit_score, self.last_breakdown.weak_components, suggestions)

        return kit_score
|
||||
|
||||
    def score_section_transition(self, samples_a: List[str], samples_b: List[str],
                                 enforce_threshold: bool = True) -> float:
        """
        Calculate coherence of transition between two sections.

        Compares all samples in section A against all samples in section B
        (cross-product) and averages the pairwise scores to estimate how
        smooth the transition is.

        Args:
            samples_a: List of sample paths in first section
            samples_b: List of sample paths in second section
            enforce_threshold: If True, raises CoherenceError if score < 0.90

        Returns:
            Transition coherence score (0.0-1.0)

        Raises:
            CoherenceError: If score < MIN_COHERENCE and enforce_threshold=True
            ValueError: If either section is empty or no comparison succeeds
        """
        if not samples_a or not samples_b:
            raise ValueError("Both sections must contain at least one sample")

        # Cross-section comparisons; failed pairs are skipped (not scored 0.0)
        scores = []

        for sample_a in samples_a:
            for sample_b in samples_b:
                try:
                    score = self.score_pair(sample_a, sample_b, enforce_threshold=False)
                    scores.append(score)
                except Exception as e:
                    print(f"Warning: Cross-section comparison failed: {e}")

        if not scores:
            raise ValueError("No valid cross-section comparisons")

        transition_score = np.mean(scores)

        # Analyze worst transitions
        # NOTE(review): scores is guaranteed non-empty here (checked above),
        # so the else branch below is unreachable; kept for safety.
        if scores:
            min_score = min(scores)
            weak_count = sum(1 for s in scores if s < 0.75)
        else:
            min_score = 0.0
            weak_count = 0

        suggestions = []
        if min_score < 0.70:
            suggestions.append(
                f"Poor transition detected (worst pair: {min_score:.3f}). "
                "Consider using transition FX or crossfade."
            )
        if weak_count > len(scores) * 0.3:
            suggestions.append(
                f"{weak_count}/{len(scores)} transitions are weak. "
                "Sections may be harmonically or sonically incompatible."
            )

        # Per-component fields are zeroed: not meaningful for a cross-section
        # average.
        self.last_breakdown = ScoreBreakdown(
            overall_score=transition_score,
            timbre_similarity=0.0,
            transient_compatibility=0.0,
            spectral_balance=0.0,
            energy_consistency=0.0,
            is_professional=transition_score >= self.MIN_COHERENCE,
            weak_components=[f"Weak transitions: {weak_count}"] if weak_count > 0 else [],
            suggestions=suggestions if suggestions else ["Transition coherence is acceptable"]
        )

        if enforce_threshold and transition_score < self.MIN_COHERENCE:
            raise CoherenceError(transition_score, self.last_breakdown.weak_components, suggestions)

        return transition_score
|
||||
|
||||
def get_score_breakdown(self) -> Dict:
|
||||
"""
|
||||
Get detailed breakdown of the last coherence calculation.
|
||||
|
||||
Returns:
|
||||
Dictionary with component scores and analysis
|
||||
"""
|
||||
if self.last_breakdown is None:
|
||||
return {
|
||||
'error': 'No coherence calculation performed yet. '
|
||||
'Call score_pair(), score_kit(), or score_section_transition() first.'
|
||||
}
|
||||
|
||||
return self.last_breakdown.to_dict()
|
||||
|
||||
@staticmethod
|
||||
def is_professional_grade(score: float) -> bool:
|
||||
"""
|
||||
Check if a coherence score meets professional standards.
|
||||
|
||||
Args:
|
||||
score: Coherence score to evaluate
|
||||
|
||||
Returns:
|
||||
True if score >= MIN_COHERENCE (0.90)
|
||||
"""
|
||||
return score >= CoherenceScorer.MIN_COHERENCE
|
||||
|
||||
    def batch_score(self, sample_paths: List[str], mode: str = 'pairwise') -> Dict:
        """
        Batch coherence analysis for multiple samples.

        Args:
            sample_paths: List of sample paths to analyze
            mode: 'pairwise' for per-pair results, 'kit' for one overall score

        Returns:
            Dictionary with scores and analysis. In 'pairwise' mode each pair
            entry carries either a score or an 'error'; min/max/avg cover the
            successful comparisons only.

        Raises:
            ValueError: If mode is neither 'pairwise' nor 'kit'
        """
        if mode == 'pairwise':
            # Defaults stand in when no pair succeeds (min stays 1.0, etc.)
            results = {
                'mode': 'pairwise',
                'pairs': [],
                'min_score': 1.0,
                'max_score': 0.0,
                'avg_score': 0.0
            }

            scores = []
            for i in range(len(sample_paths)):
                for j in range(i + 1, len(sample_paths)):
                    try:
                        score = self.score_pair(
                            sample_paths[i],
                            sample_paths[j],
                            enforce_threshold=False
                        )
                        scores.append(score)
                        results['pairs'].append({
                            'sample_a': Path(sample_paths[i]).name,
                            'sample_b': Path(sample_paths[j]).name,
                            'score': round(score, 4),
                            'professional': score >= self.MIN_COHERENCE
                        })
                    except Exception as e:
                        # Failed pairs are reported, not scored.
                        results['pairs'].append({
                            'sample_a': Path(sample_paths[i]).name,
                            'sample_b': Path(sample_paths[j]).name,
                            'error': str(e)
                        })

            if scores:
                results['min_score'] = round(min(scores), 4)
                results['max_score'] = round(max(scores), 4)
                results['avg_score'] = round(np.mean(scores), 4)

            return results

        elif mode == 'kit':
            # Delegate to score_kit; breakdown comes from its last_breakdown.
            score = self.score_kit(sample_paths, enforce_threshold=False)
            return {
                'mode': 'kit',
                'kit_score': round(score, 4),
                'professional': score >= self.MIN_COHERENCE,
                'sample_count': len(sample_paths),
                'breakdown': self.get_score_breakdown()
            }

        else:
            raise ValueError(f"Unknown mode: {mode}. Use 'pairwise' or 'kit'")
|
||||
|
||||
|
||||
# Convenience functions for quick access
|
||||
def check_coherence(sample1: str, sample2: str) -> Dict:
    """
    Quick one-shot coherence check between two samples.

    Args:
        sample1: Path to first audio file
        sample2: Path to second audio file

    Returns:
        Dict with a 'coherent' flag plus either 'score'/'details' on
        success or an 'error' message on failure.
    """
    scorer = CoherenceScorer()
    try:
        score = scorer.score_pair(sample1, sample2, enforce_threshold=False)
    except Exception as e:
        return {
            'coherent': False,
            'error': str(e)
        }
    return {
        'coherent': score >= CoherenceScorer.MIN_COHERENCE,
        'score': round(score, 4),
        'details': scorer.get_score_breakdown()
    }
|
||||
|
||||
|
||||
def check_kit_coherence(sample_paths: List[str]) -> Dict:
    """
    Quick one-shot kit coherence check.

    Args:
        sample_paths: List of sample paths

    Returns:
        Dict with a 'coherent' flag plus either 'score'/'details' on
        success or an 'error' message on failure.
    """
    scorer = CoherenceScorer()
    try:
        score = scorer.score_kit(sample_paths, enforce_threshold=False)
    except Exception as e:
        return {
            'coherent': False,
            'error': str(e)
        }
    return {
        'coherent': score >= CoherenceScorer.MIN_COHERENCE,
        'score': round(score, 4),
        'details': scorer.get_score_breakdown()
    }
|
||||
Reference in New Issue
Block a user