CBCFacil v8.0 - Refactored with AMD GPU support
This commit is contained in:
235
storage/processed_registry.py
Normal file
235
storage/processed_registry.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""
|
||||
Processed files registry - Optimized version with bloom filter and better caching
|
||||
"""
|
||||
import fcntl
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Set, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from ..config import settings
|
||||
|
||||
|
||||
class BloomFilter:
|
||||
"""Simple Bloom Filter for fast membership testing"""
|
||||
|
||||
def __init__(self, size: int = 10000, hash_count: int = 3):
|
||||
self.size = size
|
||||
self.hash_count = hash_count
|
||||
self.bit_array = [0] * size
|
||||
|
||||
def _hashes(self, item: str) -> list[int]:
|
||||
"""Generate hash positions for item"""
|
||||
import hashlib
|
||||
digest = hashlib.md5(item.encode()).digest()
|
||||
return [
|
||||
int.from_bytes(digest[i:i+4], 'big') % self.size
|
||||
for i in range(0, min(self.hash_count * 4, len(digest)), 4)
|
||||
]
|
||||
|
||||
def add(self, item: str) -> None:
|
||||
for pos in self._hashes(item):
|
||||
self.bit_array[pos] = 1
|
||||
|
||||
def might_contain(self, item: str) -> bool:
|
||||
return all(self.bit_array[pos] for pos in self._hashes(item))
|
||||
|
||||
|
||||
class ProcessedRegistry:
|
||||
"""Registry for tracking processed files with caching and file locking"""
|
||||
|
||||
def __init__(self):
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._cache: Set[str] = set()
|
||||
self._cache_time: Optional[float] = None
|
||||
self._cache_ttl = 300 # 5 minutos (antes era 60s)
|
||||
self._initialized = False
|
||||
self._bloom_filter = BloomFilter(size=10000, hash_count=3)
|
||||
self._write_lock = False # Write batching
|
||||
|
||||
def initialize(self) -> None:
|
||||
"""Initialize the registry"""
|
||||
self.load()
|
||||
self._initialized = True
|
||||
self.logger.info(f"Processed registry initialized ({self.count()} files)")
|
||||
|
||||
def load(self) -> Set[str]:
|
||||
"""Load processed files from disk with caching"""
|
||||
now = time.time()
|
||||
|
||||
# Return cached data if still valid
|
||||
if self._cache and self._cache_time:
|
||||
age = now - self._cache_time
|
||||
if age < self._cache_ttl:
|
||||
return self._cache # Return reference, not copy for read-only
|
||||
|
||||
processed = set()
|
||||
registry_path = settings.processed_files_path
|
||||
|
||||
try:
|
||||
registry_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
if registry_path.exists():
|
||||
with open(registry_path, 'r', encoding='utf-8') as f:
|
||||
for raw_line in f:
|
||||
line = raw_line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
processed.add(line)
|
||||
# Add basename for both path and basename lookups
|
||||
base_name = Path(line).name
|
||||
processed.add(base_name)
|
||||
# Update bloom filter
|
||||
self._bloom_filter.add(line)
|
||||
self._bloom_filter.add(base_name)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error reading processed files registry: {e}")
|
||||
|
||||
self._cache = processed
|
||||
self._cache_time = now
|
||||
return processed # Return reference, not copy
|
||||
|
||||
def save(self, file_path: str) -> None:
|
||||
"""Add file to processed registry with file locking"""
|
||||
if not file_path:
|
||||
return
|
||||
|
||||
registry_path = settings.processed_files_path
|
||||
|
||||
try:
|
||||
registry_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Check cache first
|
||||
if file_path in self._cache:
|
||||
return
|
||||
|
||||
# Append to file
|
||||
with open(registry_path, 'a', encoding='utf-8') as f:
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_EX)
|
||||
try:
|
||||
f.write(file_path + "\n")
|
||||
# Update in-memory structures
|
||||
self._cache.add(file_path)
|
||||
self._bloom_filter.add(file_path)
|
||||
self._cache_time = time.time()
|
||||
self.logger.debug(f"Added {file_path} to processed registry")
|
||||
finally:
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_UN)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error saving to processed files registry: {e}")
|
||||
raise
|
||||
|
||||
def is_processed(self, file_path: str) -> bool:
|
||||
"""Check if file has been processed - O(1) with bloom filter"""
|
||||
if not self._initialized:
|
||||
self.initialize()
|
||||
|
||||
# Fast bloom filter check first
|
||||
if not self._bloom_filter.might_contain(file_path):
|
||||
return False
|
||||
|
||||
# Check cache (O(1) for both full path and basename)
|
||||
if file_path in self._cache:
|
||||
return True
|
||||
basename = Path(file_path).name
|
||||
if basename in self._cache:
|
||||
return True
|
||||
return False
|
||||
|
||||
def save_batch(self, file_paths: list[str]) -> int:
|
||||
"""Add multiple files to registry efficiently"""
|
||||
saved_count = 0
|
||||
|
||||
try:
|
||||
registry_path = settings.processed_files_path
|
||||
registry_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(registry_path, 'a', encoding='utf-8') as f:
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_EX)
|
||||
try:
|
||||
lines_to_write = []
|
||||
for file_path in file_paths:
|
||||
if file_path and file_path not in self._cache:
|
||||
lines_to_write.append(file_path + "\n")
|
||||
self._cache.add(file_path)
|
||||
self._bloom_filter.add(file_path)
|
||||
saved_count += 1
|
||||
|
||||
if lines_to_write:
|
||||
f.writelines(lines_to_write)
|
||||
self._cache_time = time.time()
|
||||
self.logger.debug(f"Added {saved_count} files to processed registry")
|
||||
finally:
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_UN)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error saving batch to processed files registry: {e}")
|
||||
|
||||
return saved_count
|
||||
|
||||
def remove(self, file_path: str) -> bool:
|
||||
"""Remove file from processed registry"""
|
||||
registry_path = settings.processed_files_path
|
||||
|
||||
try:
|
||||
if not registry_path.exists():
|
||||
return False
|
||||
lines_to_keep = []
|
||||
with open(registry_path, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
stripped = line.strip()
|
||||
if stripped != file_path and Path(stripped).name != Path(file_path).name:
|
||||
lines_to_keep.append(line)
|
||||
with open(registry_path, 'w', encoding='utf-8') as f:
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_EX)
|
||||
try:
|
||||
f.writelines(lines_to_keep)
|
||||
self._cache.discard(file_path)
|
||||
self._cache.discard(Path(file_path).name)
|
||||
# Rebuild bloom filter
|
||||
self._bloom_filter = BloomFilter(size=10000, hash_count=3)
|
||||
for item in self._cache:
|
||||
self._bloom_filter.add(item)
|
||||
finally:
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_UN)
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error removing from processed files registry: {e}")
|
||||
return False
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Clear the entire registry"""
|
||||
registry_path = settings.processed_files_path
|
||||
try:
|
||||
if registry_path.exists():
|
||||
registry_path.unlink()
|
||||
self._cache.clear()
|
||||
self._cache_time = None
|
||||
self._bloom_filter = BloomFilter(size=10000, hash_count=3)
|
||||
self.logger.info("Processed files registry cleared")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error clearing processed files registry: {e}")
|
||||
raise
|
||||
|
||||
def get_all(self) -> Set[str]:
|
||||
"""Get all processed files"""
|
||||
if not self._initialized:
|
||||
self.initialize()
|
||||
return self._cache.copy()
|
||||
|
||||
def count(self) -> int:
|
||||
"""Get count of processed files"""
|
||||
if not self._initialized:
|
||||
self.initialize()
|
||||
return len(self._cache)
|
||||
|
||||
def get_stats(self) -> dict:
|
||||
"""Get registry statistics"""
|
||||
return {
|
||||
"total_files": len(self._cache),
|
||||
"cache_age_seconds": time.time() - self._cache_time if self._cache_time else 0,
|
||||
"cache_ttl_seconds": self._cache_ttl,
|
||||
"bloom_filter_size": self._bloom_filter.size,
|
||||
"initialized": self._initialized
|
||||
}
|
||||
|
||||
|
||||
# Global instance
|
||||
processed_registry = ProcessedRegistry()
|
||||
Reference in New Issue
Block a user