ableton-mcp-ai/AbletonMCP_AI/MCP_Server/segment_rag_builder.py
renato97 b92887836f Fix: Update paths from all_tracks to organized_samples
- segment_rag_builder.py: change the default library to organized_samples
- scan_audio.py: update the path to organized_samples
- rebuild_index.py: utility script to rebuild the embeddings

Refs: transition to organized_samples with bucket sampling
2026-03-29 01:50:28 -03:00


"""
segment_rag_builder.py - Build or refresh the persistent segment-audio index.
"""
from __future__ import annotations
import argparse
import json
import logging
from pathlib import Path
from reference_listener import (
    ReferenceAudioListener,
    export_segment_rag_manifest,
    generate_segment_rag_summary,
    _get_segment_rag_status,
    _backfill_segment_cache_metadata,
)
logger = logging.getLogger(__name__)
def _default_library_dir() -> Path:
    # Default to <repo_root>/librerias/organized_samples, resolved relative to this file.
    return Path(__file__).resolve().parents[2] / "librerias" / "organized_samples"


def main() -> int:
    parser = argparse.ArgumentParser(description="Build the persistent segment-audio retrieval cache.")
    parser.add_argument("--library-dir", default=str(_default_library_dir()), help="Audio library directory")
    parser.add_argument("--roles", nargs="*", default=None, help="Subset of roles to index")
    parser.add_argument("--max-files", type=int, default=None, help="Optional limit for targeted files")
    parser.add_argument("--duration-limit", type=float, default=24.0, help="Max seconds per file during indexing")
    parser.add_argument("--force", action="store_true", help="Rebuild even if persistent segment cache already exists")
    parser.add_argument("--json", action="store_true", help="Emit full JSON report")
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output")
    parser.add_argument("--offset", type=int, default=0, help="Skip first N files before starting (for chunked indexing)")
    parser.add_argument("--batch-size", type=int, default=None, help="Process exactly N files then stop (for chunked indexing)")
    parser.add_argument("--output-manifest", type=str, default=None, help="Path to save full manifest JSON")
    parser.add_argument("--output-summary", type=str, default=None, help="Path to save summary report")
    parser.add_argument("--resume", action="store_true", help="Resume from previous run state")
    parser.add_argument("--export-manifest", type=str, default=None,
                        help="Export candidate manifest to FILE (format: .json or .md)")
    parser.add_argument("--export-format", type=str, default="json",
                        choices=['json', 'markdown'], help="Manifest export format")
    parser.add_argument("--status", action="store_true", help="Show current index status without building")
    parser.add_argument("--backfill-metadata", action="store_true", help="Backfill metadata into existing cache files from indexing state")
    parser.add_argument("--force-backfill", action="store_true", help="Force backfill even for files that already have metadata")
    args = parser.parse_args()
    # Configure logging based on verbose flag
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')
    else:
        logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
    # Handle --status flag for early exit
    if args.status:
        status = _get_segment_rag_status(Path(args.library_dir))
        if args.json:
            print(json.dumps(status, indent=2, default=str))
        else:
            print("=" * 60)
            print("SEGMENT RAG INDEX STATUS")
            print("=" * 60)
            print(f"Cache Directory: {status['cache_dir']}")
            print(f"Cache Files: {status['cache_files']}")
            print(f"Total Indexed Segments: {status['total_segments']}")
            print(f"Status: {status.get('status', 'unknown')}")
            if status.get('role_coverage'):
                print("\nRole Coverage:")
                for role, count in sorted(status['role_coverage'].items()):
                    print(f" {role}: {count} segments")
            if status.get('newest_entries'):
                print(f"\nNewest Entries: {len(status['newest_entries'])} files")
                for entry in status['newest_entries'][:5]:
                    print(f" - {entry['file_name']} ({entry['segments']} segments)")
            if status.get('oldest_entries'):
                print(f"\nOldest Entries: {len(status['oldest_entries'])} files")
                for entry in status['oldest_entries'][:5]:
                    print(f" - {entry['file_name']} ({entry['segments']} segments)")
        return 0
    # Handle --backfill-metadata flag for early exit
    if args.backfill_metadata:
        result = _backfill_segment_cache_metadata(Path(args.library_dir), force=args.force_backfill)
        if args.json:
            print(json.dumps(result, indent=2, default=str))
        else:
            print("=" * 60)
            print("SEGMENT CACHE METADATA BACKFILL")
            print("=" * 60)
            print(f"Cache Directory: {result['cache_dir']}")
            print(f"Cache Files: {result['cache_files']}")
            print(f"Backfilled: {result['backfilled']}")
            print(f"Skipped: {result['skipped']}")
            print(f"Errors: {result['errors']}")
            print(f"Status: {result.get('status', 'unknown')}")
        return 0
    listener = ReferenceAudioListener(args.library_dir)
    report = listener.build_segment_rag_index(
        roles=args.roles,
        max_files=args.max_files,
        duration_limit=args.duration_limit,
        force=args.force,
        offset=args.offset,
        batch_size=args.batch_size,
        resume=args.resume,
    )
    # Generate enhanced summary
    summary = generate_segment_rag_summary(report, Path(args.library_dir))
    if args.json:
        print(json.dumps(summary, indent=2, default=str))
    else:
        # Enhanced text output
        print("=" * 60)
        print("SEGMENT RAG INDEX COMPLETE")
        print("=" * 60)
        print(f"Device: {summary['device']}")
        print(f"Cache: {summary['segment_index_dir']}")
        print()
        print(f"Files: {summary['files_targeted']} targeted")
        print(f" Built: {summary['built']}")
        print(f" Reused: {summary['reused']}")
        print(f" Skipped: {summary['skipped']}")
        print(f" Errors: {summary['errors']}")
        print()
        print(f"Total Segments: {summary['total_segments']}")
        if 'summary_stats' in summary:
            stats = summary['summary_stats']
            print(f" Avg per file: {stats['avg_segments_per_file']:.1f}")
            print(f" Range: {stats['min_segments']} - {stats['max_segments']}")
        if 'role_coverage' in summary:
            print("\nRole Coverage:")
            for role in sorted(summary['role_coverage'].keys()):
                print(f" {role}: {summary['role_coverage'][role]} segments")
        if 'cache_info' in summary:
            info = summary['cache_info']
            print(f"\nCache Size: {info['cache_size_mb']} MB")
        if args.offset > 0:
            print(f"\nOffset: {args.offset}")
        if args.batch_size is not None:
            print(f"Batch Size: {args.batch_size}")
            print(f"Files Remaining: {summary.get('files_remaining', 'unknown')}")
    # Save manifest if requested
    if args.output_manifest:
        manifest_path = Path(args.output_manifest)
        manifest_path.parent.mkdir(parents=True, exist_ok=True)
        with open(manifest_path, 'w') as f:
            json.dump({
                "report": report,
                "full_manifest": report.get("manifest", []),
            }, f, indent=2)
        if not args.json:
            print(f"\nManifest saved to: {manifest_path}")
    # Save summary if requested
    if args.output_summary:
        summary_path = Path(args.output_summary)
        summary_path.parent.mkdir(parents=True, exist_ok=True)
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=2, default=str)
        if not args.json:
            print(f"Summary saved to: {summary_path}")
    # Export manifest in requested format
    if args.export_manifest:
        manifest_path = Path(args.export_manifest)
        export_format = args.export_format
        # Infer markdown from a .md extension when the format is left at the default
        if not args.export_format or args.export_format == "json":
            if manifest_path.suffix == '.md':
                export_format = 'markdown'
            else:
                export_format = 'json'
        export_segment_rag_manifest(
            report.get('manifest', []),
            manifest_path,
            format=export_format
        )
        if not args.json:
            print(f"Manifest exported to: {manifest_path}")
    return 0
if __name__ == "__main__":
    raise SystemExit(main())