CBCFacil v8.0 - Refactored with AMD GPU support

This commit is contained in:
2026-01-09 13:05:46 -03:00
parent cb17136f21
commit b017504c52
54 changed files with 7251 additions and 3670 deletions

355
core/health_check.py Normal file
View File

@@ -0,0 +1,355 @@
"""
Health checks for CBCFacil service dependencies, with a CLI entry point for monitoring
"""
import json
import logging
from datetime import datetime
from typing import Dict, Any, List, Optional
from pathlib import Path
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class HealthChecker:
    """Comprehensive health check for all service dependencies.

    Each ``check_*`` method returns a dict that always carries the keys
    ``service``, ``status`` and ``timestamp``. Failures inside a check
    (including a failed import of a project module) are logged and reported
    through ``status``/``error`` instead of being raised.
    """

    _BYTES_PER_GB = 1024 ** 3
    # Statuses that roll up into the "unhealthy" / "warning" buckets, used for
    # both the overall status and the summary counters so they always agree.
    _FAILED_STATUSES = ("unhealthy", "error")
    _WARNING_STATUSES = ("warning", "degraded")

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _new_result(service: str) -> Dict[str, Any]:
        """Return the common skeleton shared by every check result."""
        return {
            "service": service,
            "status": "unknown",
            # Naive UTC timestamp; format kept as-is for existing consumers.
            "timestamp": datetime.utcnow().isoformat(),
        }

    def _record_failure(
        self, result: Dict[str, Any], status: str, label: str, exc: Exception
    ) -> None:
        """Store *exc* on *result* under the given status and log it."""
        result["status"] = status
        result["error"] = str(exc)
        self.logger.error("%s health check failed: %s", label, exc)

    @staticmethod
    def _api_key_provider(credential: Any) -> Dict[str, Any]:
        """Describe a credential-based provider; only presence is checked."""
        if credential:
            return {"configured": True, "status": "unknown"}
        return {"configured": False, "status": "not_configured"}

    @staticmethod
    def _cli_provider(cli_path: Any) -> Dict[str, Any]:
        """Describe a CLI-based provider by whether its path exists."""
        exists = Path(cli_path).exists()
        return {
            "configured": True,
            "path_exists": exists,
            "status": "available" if exists else "path_invalid",
        }

    def check_webdav_connection(self) -> Dict[str, Any]:
        """Check WebDAV service connectivity.

        Returns a result whose status is ``not_configured`` when the settings
        report no WebDAV config, ``healthy`` when listing ``"."`` succeeds,
        and ``unhealthy`` (with ``error``) on any exception.
        """
        result = self._new_result("webdav")
        try:
            from config import settings
            from services.webdav_service import webdav_service

            if not settings.has_webdav_config:
                result["status"] = "not_configured"
                result["message"] = "WebDAV credentials not configured"
                return result

            # Test connection with a simple list operation
            webdav_service.list(".")
            result["status"] = "healthy"
            result["message"] = "WebDAV connection successful"
            result["endpoint"] = settings.NEXTCLOUD_URL
        except Exception as e:
            self._record_failure(result, "unhealthy", "WebDAV", e)
        return result

    def check_ai_providers(self) -> Dict[str, Any]:
        """Check AI provider configurations.

        Only settings and CLI paths are inspected; no provider is contacted,
        so credential-based providers report ``status: "unknown"`` when
        configured. CLI providers appear in ``providers`` only when their
        path setting is truthy.
        """
        result = self._new_result("ai_providers")
        result["providers"] = {}
        try:
            from config import settings

            providers = result["providers"]
            providers["zai"] = self._api_key_provider(settings.ZAI_AUTH_TOKEN)
            providers["gemini"] = self._api_key_provider(settings.GEMINI_API_KEY)

            # Check CLI providers
            if settings.CLAUDE_CLI_PATH:
                providers["claude_cli"] = self._cli_provider(settings.CLAUDE_CLI_PATH)
            if settings.GEMINI_CLI_PATH:
                providers["gemini_cli"] = self._cli_provider(settings.GEMINI_CLI_PATH)

            # Overall status
            if settings.has_ai_config:
                result["status"] = "healthy"
                result["message"] = "At least one AI provider configured"
            else:
                result["status"] = "not_configured"
                result["message"] = "No AI providers configured"
        except Exception as e:
            self._record_failure(result, "error", "AI providers", e)
        return result

    def check_vram_manager(self) -> Dict[str, Any]:
        """Check VRAM manager status.

        Reports ``total``/``free``/``allocated`` from the manager's info dict
        divided by 1024**3 and rounded to two decimals; missing keys count as
        0. Any exception yields ``status: "unavailable"``.
        """
        result = self._new_result("vram_manager")
        try:
            from services.vram_manager import vram_manager

            vram_info = vram_manager.get_vram_info()
            result["status"] = "healthy"
            result["vram_info"] = {
                "total_gb": round(vram_info.get("total", 0) / self._BYTES_PER_GB, 2),
                "free_gb": round(vram_info.get("free", 0) / self._BYTES_PER_GB, 2),
                "allocated_gb": round(
                    vram_info.get("allocated", 0) / self._BYTES_PER_GB, 2
                ),
            }
            result["cuda_available"] = vram_info.get("cuda_available", False)
        except Exception as e:
            self._record_failure(result, "unavailable", "VRAM manager", e)
        return result

    def check_telegram_service(self) -> Dict[str, Any]:
        """Check Telegram service status.

        ``healthy`` when ``telegram_service.is_configured`` is truthy,
        ``not_configured`` otherwise, ``error`` on any exception.
        """
        result = self._new_result("telegram")
        try:
            from services.telegram_service import telegram_service

            if telegram_service.is_configured:
                result["status"] = "healthy"
                result["message"] = "Telegram service configured"
            else:
                result["status"] = "not_configured"
                result["message"] = "Telegram credentials not configured"
        except Exception as e:
            self._record_failure(result, "error", "Telegram service", e)
        return result

    def check_processed_registry(self) -> Dict[str, Any]:
        """Check processed files registry.

        Loads the registry, then reports whether the registry path exists and,
        if it does, whether it is a regular file writable by this process.
        Any exception yields ``status: "unhealthy"``.
        """
        # Imported here so the check also works when this module is imported
        # rather than run as a script.
        import os

        result = self._new_result("processed_registry")
        try:
            from storage.processed_registry import processed_registry

            # Try to load registry
            processed_registry.load()
            result["status"] = "healthy"
            result["registry_path"] = str(processed_registry.registry_path)

            # Check if registry file is writable
            registry_file = Path(processed_registry.registry_path)
            if registry_file.exists():
                result["registry_exists"] = True
                result["registry_writable"] = registry_file.is_file() and os.access(
                    registry_file, os.W_OK
                )
            else:
                result["registry_exists"] = False
        except Exception as e:
            self._record_failure(result, "unhealthy", "Processed registry", e)
        return result

    def check_disk_space(self) -> Dict[str, Any]:
        """Check available disk space.

        Measures the filesystem holding the parent of this file's directory.
        Status is ``warning`` below 1 GB free, ``degraded`` below 5 GB free,
        ``healthy`` otherwise, and ``error`` on any exception.
        """
        result = self._new_result("disk_space")
        try:
            import shutil

            # Check main directory
            usage = shutil.disk_usage(Path(__file__).parent.parent)
            free_gb = usage.free / self._BYTES_PER_GB

            result["status"] = "healthy"
            result["total_gb"] = round(usage.total / self._BYTES_PER_GB, 2)
            result["free_gb"] = round(free_gb, 2)
            result["used_percent"] = round((usage.used / usage.total) * 100, 2)

            # Warning if low disk space
            if free_gb < 1:  # Less than 1GB
                result["status"] = "warning"
                result["message"] = "Low disk space"
            elif free_gb < 5:  # Less than 5GB
                result["status"] = "degraded"
                result["message"] = "Disk space running low"
        except Exception as e:
            self._record_failure(result, "error", "Disk space", e)
        return result

    def check_configuration(self) -> Dict[str, Any]:
        """Check configuration validity.

        Collects human-readable warnings about missing or default settings.
        Status is ``healthy`` with no warnings, ``warning`` otherwise, and
        ``error`` on any exception.
        """
        result = self._new_result("configuration")
        try:
            from config import settings

            warnings = []
            if not settings.has_webdav_config:
                warnings.append("WebDAV not configured")
            if not settings.has_ai_config:
                warnings.append("AI providers not configured")
            if not settings.telegram_configured:
                warnings.append("Telegram not configured")
            if settings.DASHBOARD_SECRET_KEY == "":
                warnings.append("Dashboard secret key not set")
            if settings.DASHBOARD_SECRET_KEY == "dashboard-secret-key-change-in-production":
                warnings.append("Using default dashboard secret")

            result["status"] = "healthy" if not warnings else "warning"
            result["warnings"] = warnings
            result["environment"] = settings.environment_type
        except Exception as e:
            self._record_failure(result, "error", "Configuration", e)
        return result

    def run_full_health_check(self) -> Dict[str, Any]:
        """Run all health checks and return comprehensive status.

        Returns a dict with ``overall_status``, ``timestamp``, per-check
        ``checks`` and a ``summary``. ``unhealthy``/``error`` results make the
        overall status ``unhealthy``; ``warning``/``degraded`` results make it
        ``warning`` unless it is already ``unhealthy``. The summary counters
        use the same buckets. Other statuses (for example ``not_configured``)
        affect neither the overall status nor the warning/unhealthy counters.
        """
        checks = [
            ("configuration", self.check_configuration),
            ("webdav", self.check_webdav_connection),
            ("ai_providers", self.check_ai_providers),
            ("vram_manager", self.check_vram_manager),
            ("telegram", self.check_telegram_service),
            ("processed_registry", self.check_processed_registry),
            ("disk_space", self.check_disk_space),
        ]

        results = {}
        overall_status = "healthy"
        for check_name, check_func in checks:
            try:
                result = check_func()
                results[check_name] = result
                # Track overall status
                if result["status"] in self._FAILED_STATUSES:
                    overall_status = "unhealthy"
                elif (
                    result["status"] in self._WARNING_STATUSES
                    and overall_status == "healthy"
                ):
                    overall_status = "warning"
            except Exception as e:
                # A check that raises (or returns a malformed result) is
                # recorded as an error so the report stays complete.
                results[check_name] = {
                    "service": check_name,
                    "status": "error",
                    "error": str(e),
                    "timestamp": datetime.utcnow().isoformat(),
                }
                overall_status = "unhealthy"
                self.logger.error("Health check %s failed: %s", check_name, e)

        statuses = [entry["status"] for entry in results.values()]
        return {
            "overall_status": overall_status,
            "timestamp": datetime.utcnow().isoformat(),
            "checks": results,
            "summary": {
                "total_checks": len(checks),
                "healthy": sum(1 for s in statuses if s == "healthy"),
                "warning": sum(1 for s in statuses if s in self._WARNING_STATUSES),
                "unhealthy": sum(1 for s in statuses if s in self._FAILED_STATUSES),
            },
        }
# Convenience function for CLI usage
def get_health_status() -> Dict[str, Any]:
    """Build a fresh HealthChecker and return its full health report."""
    return HealthChecker().run_full_health_check()
if __name__ == "__main__":
    # CLI usage: python core/health_check.py
    # `os` is not referenced in this block; the import is retained unchanged.
    import os
    import sys

    report = get_health_status()
    print(json.dumps(report, indent=2))

    # Exit code mirrors the overall status: 0 healthy, 1 warning, 2 otherwise.
    exit_codes = {"healthy": 0, "warning": 1}
    sys.exit(exit_codes.get(report["overall_status"], 2))