""" segment_rag_builder.py - Build or refresh the persistent segment-audio index. """ from __future__ import annotations import argparse import json import logging from pathlib import Path from reference_listener import ReferenceAudioListener, export_segment_rag_manifest, generate_segment_rag_summary, _get_segment_rag_status, _backfill_segment_cache_metadata logger = logging.getLogger(__name__) def _default_library_dir() -> Path: return Path(__file__).resolve().parents[2] / "librerias" / "organized_samples" def main() -> int: parser = argparse.ArgumentParser(description="Build the persistent segment-audio retrieval cache.") parser.add_argument("--library-dir", default=str(_default_library_dir()), help="Audio library directory") parser.add_argument("--roles", nargs="*", default=None, help="Subset of roles to index") parser.add_argument("--max-files", type=int, default=None, help="Optional limit for targeted files") parser.add_argument("--duration-limit", type=float, default=24.0, help="Max seconds per file during indexing") parser.add_argument("--force", action="store_true", help="Rebuild even if persistent segment cache already exists") parser.add_argument("--json", action="store_true", help="Emit full JSON report") parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output") parser.add_argument("--offset", type=int, default=0, help="Skip first N files before starting (for chunked indexing)") parser.add_argument("--batch-size", type=int, default=None, help="Process exactly N files then stop (for chunked indexing)") parser.add_argument("--output-manifest", type=str, default=None, help="Path to save full manifest JSON") parser.add_argument("--output-summary", type=str, default=None, help="Path to save summary report") parser.add_argument("--resume", action="store_true", help="Resume from previous run state") parser.add_argument("--export-manifest", type=str, default=None, help="Export candidate manifest to FILE (format: .json or .md)") parser.add_argument("--export-format", type=str, default="json", choices=['json', 'markdown'], help="Manifest export format") parser.add_argument("--status", action="store_true", help="Show current index status without building") parser.add_argument("--backfill-metadata", action="store_true", help="Backfill metadata into existing cache files from indexing state") parser.add_argument("--force-backfill", action="store_true", help="Force backfill even for files that already have metadata") args = parser.parse_args() # Configure logging based on verbose flag if args.verbose: logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s') else: logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') # Handle --status flag for early exit if args.status: status = _get_segment_rag_status(Path(args.library_dir)) if args.json: print(json.dumps(status, indent=2, default=str)) else: print("=" * 60) print("SEGMENT RAG INDEX STATUS") print("=" * 60) print(f"Cache Directory: {status['cache_dir']}") print(f"Cache Files: {status['cache_files']}") print(f"Total Indexed Segments: {status['total_segments']}") print(f"Status: {status.get('status', 'unknown')}") if status.get('role_coverage'): print("\nRole Coverage:") for role, count in sorted(status['role_coverage'].items()): print(f" {role}: {count} segments") if status.get('newest_entries'): print(f"\nNewest Entries: {len(status['newest_entries'])} files") for entry in status['newest_entries'][:5]: print(f" - {entry['file_name']} ({entry['segments']} segments)") if status.get('oldest_entries'): print(f"\nOldest Entries: {len(status['oldest_entries'])} files") for entry in status['oldest_entries'][:5]: print(f" - {entry['file_name']} ({entry['segments']} segments)") return 0 # Handle --backfill-metadata flag for early exit if args.backfill_metadata: result = _backfill_segment_cache_metadata(Path(args.library_dir), force=args.force_backfill) if args.json: print(json.dumps(result, indent=2, default=str)) else: print("=" * 60) print("SEGMENT CACHE METADATA BACKFILL") print("=" * 60) print(f"Cache Directory: {result['cache_dir']}") print(f"Cache Files: {result['cache_files']}") print(f"Backfilled: {result['backfilled']}") print(f"Skipped: {result['skipped']}") print(f"Errors: {result['errors']}") print(f"Status: {result.get('status', 'unknown')}") return 0 listener = ReferenceAudioListener(args.library_dir) report = listener.build_segment_rag_index( roles=args.roles, max_files=args.max_files, duration_limit=args.duration_limit, force=args.force, offset=args.offset, batch_size=args.batch_size, resume=args.resume, ) # Generate enhanced summary summary = generate_segment_rag_summary(report, Path(args.library_dir)) if args.json: print(json.dumps(summary, indent=2, default=str)) else: # Enhanced text output print("=" * 60) print("SEGMENT RAG INDEX COMPLETE") print("=" * 60) print(f"Device: {summary['device']}") print(f"Cache: {summary['segment_index_dir']}") print() print(f"Files: {summary['files_targeted']} targeted") print(f" Built: {summary['built']}") print(f" Reused: {summary['reused']}") print(f" Skipped: {summary['skipped']}") print(f" Errors: {summary['errors']}") print() print(f"Total Segments: {summary['total_segments']}") if 'summary_stats' in summary: stats = summary['summary_stats'] print(f" Avg per file: {stats['avg_segments_per_file']:.1f}") print(f" Range: {stats['min_segments']} - {stats['max_segments']}") if 'role_coverage' in summary: print("\nRole Coverage:") for role in sorted(summary['role_coverage'].keys()): print(f" {role}: {summary['role_coverage'][role]} segments") if 'cache_info' in summary: info = summary['cache_info'] print(f"\nCache Size: {info['cache_size_mb']} MB") if args.offset > 0: print(f"\nOffset: {args.offset}") if args.batch_size is not None: print(f"Batch Size: {args.batch_size}") print(f"Files Remaining: {summary.get('files_remaining', 'unknown')}") # Save manifest if requested if args.output_manifest: manifest_path = Path(args.output_manifest) manifest_path.parent.mkdir(parents=True, exist_ok=True) with open(manifest_path, 'w') as f: json.dump({ "report": report, "full_manifest": report.get("manifest", []), }, f, indent=2) if not args.json: print(f"\nManifest saved to: {manifest_path}") # Save summary if requested if args.output_summary: summary_path = Path(args.output_summary) summary_path.parent.mkdir(parents=True, exist_ok=True) with open(summary_path, 'w') as f: json.dump(summary, f, indent=2, default=str) if not args.json: print(f"Summary saved to: {summary_path}") # Export manifest in requested format if args.export_manifest: manifest_path = Path(args.export_manifest) export_format = args.export_format # Determine format from extension if not specified if not args.export_format or args.export_format == "json": if manifest_path.suffix == '.md': export_format = 'markdown' else: export_format = 'json' export_segment_rag_manifest( report.get('manifest', []), manifest_path, format=export_format ) print(f"Manifest exported to: {manifest_path}") return 0 if __name__ == "__main__": raise SystemExit(main())