CBCFacil v8.0 - Refactored with AMD GPU support
This commit is contained in:
76
.env.example
Normal file
76
.env.example
Normal file
@@ -0,0 +1,76 @@
|
||||
# =============================================================================
|
||||
# CBCFacil Configuration Template
|
||||
# =============================================================================
|
||||
# Copy this file to .env.secrets and fill in your actual values
|
||||
# NEVER commit .env.secrets to version control
|
||||
# =============================================================================
|
||||
|
||||
# =============================================================================
|
||||
# Application Configuration
|
||||
# =============================================================================
|
||||
DEBUG=false
|
||||
LOG_LEVEL=INFO
|
||||
|
||||
# =============================================================================
|
||||
# Nextcloud/WebDAV Configuration (Required for file sync)
|
||||
# =============================================================================
|
||||
NEXTCLOUD_URL=https://your-nextcloud.example.com/remote.php/webdav
|
||||
NEXTCLOUD_USER=your_username
|
||||
NEXTCLOUD_PASSWORD=your_secure_password
|
||||
|
||||
# =============================================================================
|
||||
# AI Providers Configuration (Required for summarization)
|
||||
# =============================================================================
|
||||
# Option 1: Claude/Anthropic
|
||||
ANTHROPIC_AUTH_TOKEN=your_claude_api_token_here
|
||||
ZAI_BASE_URL=https://api.z.ai/api/anthropic
|
||||
|
||||
# Option 2: Google Gemini
|
||||
GEMINI_API_KEY=your_gemini_api_key_here
|
||||
|
||||
# =============================================================================
|
||||
# CLI Tools (Optional - for local AI tools)
|
||||
# =============================================================================
|
||||
CLAUDE_CLI_PATH=/path/to/claude # or leave empty
|
||||
GEMINI_CLI_PATH=/path/to/gemini # or leave empty
|
||||
|
||||
# =============================================================================
|
||||
# Telegram Notifications (Optional)
|
||||
# =============================================================================
|
||||
TELEGRAM_TOKEN=your_telegram_bot_token
|
||||
TELEGRAM_CHAT_ID=your_telegram_chat_id
|
||||
|
||||
# =============================================================================
|
||||
# Dashboard Configuration (Required for production)
|
||||
# =============================================================================
|
||||
# Generate a secure key with: python -c "import os; print(os.urandom(24).hex())"
|
||||
DASHBOARD_SECRET_KEY=generate_a_secure_random_key_here
|
||||
DASHBOARD_HOST=0.0.0.0
|
||||
DASHBOARD_PORT=5000
|
||||
|
||||
# =============================================================================
|
||||
# GPU Configuration (Optional)
|
||||
# =============================================================================
|
||||
# Use specific GPU: CUDA_VISIBLE_DEVICES=0
|
||||
# Use all GPUs: CUDA_VISIBLE_DEVICES=all
|
||||
CUDA_VISIBLE_DEVICES=all
|
||||
PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
|
||||
|
||||
# =============================================================================
|
||||
# Performance Tuning (Optional)
|
||||
# =============================================================================
|
||||
OMP_NUM_THREADS=4
|
||||
MKL_NUM_THREADS=4
|
||||
|
||||
# =============================================================================
|
||||
# Processing Configuration (Optional)
|
||||
# =============================================================================
|
||||
POLL_INTERVAL=5
|
||||
HTTP_TIMEOUT=30
|
||||
WEBDAV_MAX_RETRIES=3
|
||||
DOWNLOAD_CHUNK_SIZE=65536
|
||||
|
||||
# =============================================================================
|
||||
# Logging (Optional)
|
||||
# =============================================================================
|
||||
LOG_FILE=logs/app.log
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,4 +1,5 @@
|
||||
# Local environment files
|
||||
.env.secrets
|
||||
.env.local
|
||||
.env
|
||||
|
||||
# Python cache
|
||||
|
||||
501
ARCHITECTURE.md
Normal file
501
ARCHITECTURE.md
Normal file
@@ -0,0 +1,501 @@
|
||||
# Arquitectura CBCFacil v9
|
||||
|
||||
## Resumen Ejecutivo
|
||||
|
||||
CBCFacil es un servicio de IA modular para procesamiento de documentos (audio, PDF, texto) con integracion a Nextcloud. Este documento describe la arquitectura actual, patrones de diseno utilizados y guia de extension del sistema.
|
||||
|
||||
## Evolucion Arquitectonica
|
||||
|
||||
### Problema Original (v7)
|
||||
|
||||
El proyecto sufria de un archivo monolitico de 3167 lineas (`main.py`) que contenia todas las responsabilidades en un solo archivo.
|
||||
|
||||
### Solucion Actual (v9)
|
||||
|
||||
Arquitectura modular con separacion clara de responsabilidades en capas independientes.
|
||||
|
||||
```
|
||||
FLUJO DE DATOS
|
||||
=============
|
||||
|
||||
1. MONITOREO 2. DESCARGA 3. PROCESAMIENTO
|
||||
+---------------+ +---------------+ +---------------+
|
||||
| Nextcloud |-------->| Downloads/ |------>| Processors |
|
||||
| (WebDAV) | | Local | | (Audio/PDF) |
|
||||
+---------------+ +---------------+ +---------------+
|
||||
| |
|
||||
v v
|
||||
+---------------+ +---------------+
|
||||
| WebDAV | | AI Services |
|
||||
| Service | | (Claude/ |
|
||||
+---------------+ | Gemini) |
|
||||
+---------------+
|
||||
|
|
||||
v
|
||||
4. GENERACION 5. REGISTRO 6. NOTIFICACION
|
||||
+---------------+ +---------------+ +---------------+
|
||||
| Document |-------->| Processed |------>| Telegram |
|
||||
| Generators | | Registry | | Service |
|
||||
+---------------+ +---------------+ +---------------+
|
||||
| |
|
||||
v v
|
||||
+---------------+ +---------------+
|
||||
| Nextcloud | | Dashboard |
|
||||
| (Upload) | | (Flask) |
|
||||
+---------------+ +---------------+
|
||||
```
|
||||
|
||||
## Estructura de Directorios
|
||||
|
||||
```
|
||||
cbcfacil/
|
||||
├── main.py # Orquestador principal (149 lineas)
|
||||
├── run.py # Script de ejecucion alternativo
|
||||
├── config/ # Configuracion centralizada
|
||||
│ ├── __init__.py # Exports: settings, validate_environment
|
||||
│ ├── settings.py # Configuracion desde variables de entorno
|
||||
│ └── validators.py # Validadores de configuracion
|
||||
├── core/ # Nucleo compartido
|
||||
│ ├── __init__.py # Exports: excepciones, Result
|
||||
│ ├── exceptions.py # Excepciones personalizadas
|
||||
│ ├── result.py # Patron Result/Error handling
|
||||
│ └── base_service.py # Clase base BaseService
|
||||
├── services/ # Servicios externos
|
||||
│ ├── __init__.py # Exports de servicios
|
||||
│ ├── webdav_service.py # WebDAV/Nextcloud operaciones
|
||||
│ ├── vram_manager.py # GPU memory management
|
||||
│ ├── telegram_service.py # Telegram notificaciones
|
||||
│ ├── metrics_collector.py # Metricas y estadisticas
|
||||
│ ├── ai_service.py # Servicio AI unificado
|
||||
│ └── ai/ # AI Providers
|
||||
│ ├── __init__.py
|
||||
│ ├── base_provider.py # Interfaz BaseProvider
|
||||
│ ├── claude_provider.py # Claude (Z.ai) implementation
|
||||
│ ├── gemini_provider.py # Gemini API/CLI implementation
|
||||
│ └── provider_factory.py # Factory para proveedores
|
||||
├── processors/ # Procesadores de archivos
|
||||
│ ├── __init__.py # Exports de procesadores
|
||||
│ ├── base_processor.py # Clase base FileProcessor
|
||||
│ ├── audio_processor.py # Whisper transcription
|
||||
│ ├── pdf_processor.py # PDF OCR processing
|
||||
│ └── text_processor.py # Text summarization
|
||||
├── document/ # Generacion de documentos
|
||||
│ ├── __init__.py
|
||||
│ └── generators.py # DOCX/PDF/Markdown generation
|
||||
├── storage/ # Persistencia y cache
|
||||
│ ├── __init__.py
|
||||
│ └── processed_registry.py # Registro de archivos procesados
|
||||
├── api/ # API REST
|
||||
│ ├── __init__.py
|
||||
│ └── routes.py # Flask routes y endpoints
|
||||
├── tests/ # Tests unitarios e integracion
|
||||
│ ├── conftest.py # Fixtures pytest
|
||||
│ ├── test_config.py # Tests de configuracion
|
||||
│ ├── test_storage.py # Tests de almacenamiento
|
||||
│ ├── test_webdav.py # Tests de WebDAV
|
||||
│ ├── test_processors.py # Tests de procesadores
|
||||
│ ├── test_ai_providers.py # Tests de AI providers
|
||||
│ ├── test_vram_manager.py # Tests de VRAM manager
|
||||
│ └── test_main_integration.py # Tests de integracion main
|
||||
├── docs/ # Documentacion
|
||||
│ ├── archive/ # Documentacion historica
|
||||
│ ├── SETUP.md # Guia de configuracion
|
||||
│ ├── TESTING.md # Guia de testing
|
||||
│ └── DEPLOYMENT.md # Guia de despliegue
|
||||
├── requirements.txt # Dependencias produccion
|
||||
├── requirements-dev.txt # Dependencias desarrollo
|
||||
├── .env.example # Template de configuracion
|
||||
├── .env.secrets # Configuracion local (no versionar)
|
||||
└── Dockerfile # Container Docker
|
||||
```
|
||||
|
||||
## Componentes Principales
|
||||
|
||||
### 1. Servicios (services/)
|
||||
|
||||
#### WebDAVService
|
||||
|
||||
**Archivo**: `services/webdav_service.py`
|
||||
|
||||
Responsabilidades:
|
||||
- Conexion y operaciones con Nextcloud via WebDAV
|
||||
- Download/upload de archivos
|
||||
- Listado y creacion de directorios remotos
|
||||
- Manejo de errores con reintentos configurables
|
||||
|
||||
```python
|
||||
class WebDAVService:
|
||||
def initialize(self) -> None: ...
|
||||
def list(self, remote_path: str) -> List[str]: ...
|
||||
def download(self, remote_path: str, local_path: Path) -> None: ...
|
||||
def upload(self, local_path: Path, remote_path: str) -> None: ...
|
||||
def mkdir(self, remote_path: str) -> None: ...
|
||||
```
|
||||
|
||||
#### VRAMManager
|
||||
|
||||
**Archivo**: `services/vram_manager.py`
|
||||
|
||||
Responsabilidades:
|
||||
- Gestion de memoria GPU
|
||||
- Carga/descarga de modelos (Whisper, OCR, TrOCR)
|
||||
- Limpieza automatica de VRAM ociosa
|
||||
- Fallback a CPU cuando GPU no disponible
|
||||
|
||||
```python
|
||||
class VRAMManager:
|
||||
def initialize(self) -> None: ...
|
||||
def cleanup(self) -> None: ...
|
||||
def should_cleanup(self) -> bool: ...
|
||||
def lazy_cleanup(self) -> None: ...
|
||||
```
|
||||
|
||||
#### TelegramService
|
||||
|
||||
**Archivo**: `services/telegram_service.py`
|
||||
|
||||
Responsabilidades:
|
||||
- Envio de notificaciones a Telegram
|
||||
- Throttling de errores para evitar spam
|
||||
- Notificaciones de inicio/parada del servicio
|
||||
|
||||
```python
|
||||
class TelegramService:
|
||||
def configure(self, token: str, chat_id: str) -> None: ...
|
||||
def send_message(self, message: str) -> None: ...
|
||||
def send_error_notification(self, context: str, error: str) -> None: ...
|
||||
def send_start_notification(self) -> None: ...
|
||||
```
|
||||
|
||||
### 2. Procesadores (processors/)
|
||||
|
||||
#### AudioProcessor
|
||||
|
||||
**Archivo**: `processors/audio_processor.py`
|
||||
|
||||
Responsabilidades:
|
||||
- Transcripcion de audio usando Whisper
|
||||
- Modelo: medium (optimizado para espanol)
|
||||
- Soporte GPU/CPU automatico
|
||||
- Post-procesamiento de texto transcrito
|
||||
|
||||
#### PDFProcessor
|
||||
|
||||
**Archivo**: `processors/pdf_processor.py`
|
||||
|
||||
Responsabilidades:
|
||||
- Extraccion de texto de PDFs
|
||||
- OCR con EasyOCR + Tesseract + TrOCR en paralelo
|
||||
- Correccion de texto con IA
|
||||
- Generacion de documentos DOCX
|
||||
|
||||
#### TextProcessor
|
||||
|
||||
**Archivo**: `processors/text_processor.py`
|
||||
|
||||
Responsabilidades:
|
||||
- Resumenes usando IA (Claude/Gemini)
|
||||
- Clasificacion de contenido
|
||||
- Generacion de quizzes opcionales
|
||||
|
||||
### 3. AI Services (services/ai/)
|
||||
|
||||
#### ProviderFactory
|
||||
|
||||
**Archivo**: `services/ai/provider_factory.py`
|
||||
|
||||
Patron Factory para seleccion dinamica de proveedor de IA:
|
||||
```python
|
||||
class ProviderFactory:
|
||||
def get_provider(self, provider_type: str = "auto") -> BaseProvider: ...
|
||||
```
|
||||
|
||||
Proveedores disponibles:
|
||||
- `claude`: Claude via Z.ai API
|
||||
- `gemini`: Google Gemini API
|
||||
- `gemini_cli`: Gemini CLI local
|
||||
- `auto`: Seleccion automatica basada en disponibilidad
|
||||
|
||||
### 4. Document Generation (document/)
|
||||
|
||||
#### DocumentGenerator
|
||||
|
||||
**Archivo**: `document/generators.py`
|
||||
|
||||
Responsabilidades:
|
||||
- Creacion de documentos DOCX
|
||||
- Conversion a PDF
|
||||
- Formateo Markdown
|
||||
- Plantillas de documentos
|
||||
|
||||
### 5. Storage (storage/)
|
||||
|
||||
#### ProcessedRegistry
|
||||
|
||||
**Archivo**: `storage/processed_registry.py`
|
||||
|
||||
Responsabilidades:
|
||||
- Registro persistente de archivos procesados
|
||||
- Cache en memoria con TTL
|
||||
- File locking para thread-safety
|
||||
|
||||
```python
|
||||
class ProcessedRegistry:
|
||||
def initialize(self) -> None: ...
|
||||
def load(self) -> Set[str]: ...
|
||||
def save(self, file_path: str) -> None: ...
|
||||
def is_processed(self, file_path: str) -> bool: ...
|
||||
def mark_for_reprocess(self, file_path: str) -> None: ...
|
||||
```
|
||||
|
||||
### 6. API (api/)
|
||||
|
||||
#### Flask Routes
|
||||
|
||||
**Archivo**: `api/routes.py`
|
||||
|
||||
Endpoints REST disponibles:
|
||||
- `GET /api/files` - Listado de archivos
|
||||
- `POST /api/reprocess` - Reprocesar archivo
|
||||
- `POST /api/mark-unprocessed` - Resetear estado
|
||||
- `GET /api/refresh` - Sincronizar con Nextcloud
|
||||
- `GET /health` - Health check
|
||||
|
||||
## Patrones de Diseno Utilizados
|
||||
|
||||
### 1. Repository Pattern
|
||||
|
||||
```python
|
||||
# storage/processed_registry.py
|
||||
class ProcessedRegistry:
|
||||
def save(self, file_path: str) -> None: ...
|
||||
def load(self) -> Set[str]: ...
|
||||
def is_processed(self, file_path: str) -> bool: ...
|
||||
```
|
||||
|
||||
### 2. Factory Pattern
|
||||
|
||||
```python
|
||||
# services/ai/provider_factory.py
|
||||
class ProviderFactory:
|
||||
def get_provider(self, provider_type: str = "auto") -> BaseProvider: ...
|
||||
```
|
||||
|
||||
### 3. Strategy Pattern
|
||||
|
||||
```python
|
||||
# services/vram_manager.py
|
||||
class VRAMManager:
|
||||
def cleanup(self) -> None: ...
|
||||
def should_cleanup(self) -> bool: ...
|
||||
def lazy_cleanup(self) -> None: ...
|
||||
```
|
||||
|
||||
### 4. Service Layer Pattern
|
||||
|
||||
```python
|
||||
# services/webdav_service.py
|
||||
class WebDAVService:
|
||||
def list(self, remote_path: str) -> List[str]: ...
|
||||
def download(self, remote_path: str, local_path: Path) -> None: ...
|
||||
def upload(self, local_path: Path, remote_path: str) -> None: ...
|
||||
```
|
||||
|
||||
### 5. Singleton Pattern
|
||||
|
||||
Servicios implementados como singletons para compartir estado:
|
||||
```python
|
||||
# services/webdav_service.py
|
||||
webdav_service = WebDAVService()
|
||||
|
||||
# services/vram_manager.py
|
||||
vram_manager = VRAMManager()
|
||||
```
|
||||
|
||||
### 6. Result Pattern
|
||||
|
||||
```python
|
||||
# core/result.py
|
||||
class Result:
|
||||
@staticmethod
|
||||
def success(value): ...
|
||||
@staticmethod
|
||||
def failure(error): ...
|
||||
```
|
||||
|
||||
## Decisiones Arquitectonicas (ADR)
|
||||
|
||||
### ADR-001: Arquitectura Modular
|
||||
|
||||
**Decision**: Separar el monolito en modulos independientes.
|
||||
|
||||
**Contexto**: El archivo main.py de 3167 lineas era dificil de mantener y testear.
|
||||
|
||||
**Decision**: Separar en capas: config/, core/, services/, processors/, document/, storage/, api/.
|
||||
|
||||
**Consecuencias**:
|
||||
- Positivo: Codigo mas mantenible y testeable
|
||||
- Positivo: Reutilizacion de componentes
|
||||
- Negativo: Mayor complejidad inicial
|
||||
|
||||
### ADR-002: Configuracion Centralizada
|
||||
|
||||
**Decision**: Usar clase Settings con variables de entorno.
|
||||
|
||||
**Contexto**: Credenciales hardcodeadas representan riesgo de seguridad.
|
||||
|
||||
**Decision**: Todas las configuraciones via variables de entorno con .env.secrets.
|
||||
|
||||
**Consecuencias**:
|
||||
- Positivo: Seguridad mejorada
|
||||
- Positivo: Facil despliegue en diferentes entornos
|
||||
- Negativo: Requiere documentacion de variables
|
||||
|
||||
### ADR-003: GPU-First con CPU Fallback
|
||||
|
||||
**Decision**: Optimizar para GPU pero soportar CPU.
|
||||
|
||||
**Contexto**: No todos los usuarios tienen GPU disponible.
|
||||
|
||||
**Decision**: VRAMManager con lazy loading y cleanup automatico.
|
||||
|
||||
**Consecuencias**:
|
||||
- Positivo: Performance optimo en GPU
|
||||
- Positivo: Funciona sin GPU
|
||||
- Negativo: Complejidad adicional en gestion de memoria
|
||||
|
||||
### ADR-004: Factory para AI Providers
|
||||
|
||||
**Decision**: Abstraer proveedores de IA detras de interfaz comun.
|
||||
|
||||
**Contexto**: Multiples proveedores (Claude, Gemini) con diferentes APIs.
|
||||
|
||||
**Decision**: BaseProvider con implementaciones concretas y ProviderFactory.
|
||||
|
||||
**Consecuencias**:
|
||||
- Positivo: Facilidad para agregar nuevos proveedores
|
||||
- Positivo: Fallback entre proveedores
|
||||
- Negativo: Sobrecarga de abstraccion
|
||||
|
||||
## Guia de Extension del Sistema
|
||||
|
||||
### Agregar Nuevo Procesador
|
||||
|
||||
1. Crear archivo en `processors/`:
|
||||
```python
|
||||
from processors.base_processor import FileProcessor
|
||||
|
||||
class NuevoProcessor(FileProcessor):
|
||||
def process(self, file_path: str) -> None:
|
||||
# Implementar procesamiento
|
||||
pass
|
||||
```
|
||||
|
||||
2. Registrar en `processors/__init__.py`:
|
||||
```python
|
||||
from processors.nuevo_processor import NuevoProcessor
|
||||
|
||||
__all__ = ['NuevoProcessor', ...]
|
||||
```
|
||||
|
||||
3. Integrar en `main.py`:
|
||||
```python
|
||||
from processors.nuevo_processor import NuevoProcessor
|
||||
|
||||
nuevo_processor = NuevoProcessor()
|
||||
```
|
||||
|
||||
### Agregar Nuevo AI Provider
|
||||
|
||||
1. Crear clase en `services/ai/`:
|
||||
```python
|
||||
from services.ai.base_provider import BaseProvider
|
||||
|
||||
class NuevoProvider(BaseProvider):
|
||||
def summarize(self, text: str) -> str:
|
||||
# Implementar
|
||||
pass
|
||||
```
|
||||
|
||||
2. Registrar en `provider_factory.py`:
|
||||
```python
|
||||
PROVIDERS = {
|
||||
'nuevo': NuevoProvider,
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
3. Usar:
|
||||
```python
|
||||
provider = factory.get_provider('nuevo')
|
||||
```
|
||||
|
||||
### Agregar Nuevo Servicio
|
||||
|
||||
1. Crear archivo en `services/`:
|
||||
```python
|
||||
from core.base_service import BaseService
|
||||
|
||||
class NuevoService(BaseService):
|
||||
def initialize(self) -> None:
|
||||
pass
|
||||
|
||||
nuevo_service = NuevoService()
|
||||
```
|
||||
|
||||
2. Inicializar en `main.py`:
|
||||
```python
|
||||
from services.nuevo_service import nuevo_service
|
||||
|
||||
nuevo_service.initialize()
|
||||
```
|
||||
|
||||
### Agregar Nuevo Endpoint API
|
||||
|
||||
1. Editar `api/routes.py`:
|
||||
```python
|
||||
@app.route('/api/nuevo', methods=['GET'])
|
||||
def nuevo_endpoint():
|
||||
return {'status': 'ok'}, 200
|
||||
```
|
||||
|
||||
## Configuracion Detallada
|
||||
|
||||
### Variables de Entorno Principales
|
||||
|
||||
| Variable | Requerido | Default | Descripcion |
|
||||
|----------|-----------|---------|-------------|
|
||||
| NEXTCLOUD_URL | Si | - | URL de Nextcloud WebDAV |
|
||||
| NEXTCLOUD_USER | Si | - | Usuario Nextcloud |
|
||||
| NEXTCLOUD_PASSWORD | Si | - | Contrasena Nextcloud |
|
||||
| ANTHROPIC_AUTH_TOKEN | No | - | Token Claude/Z.ai |
|
||||
| GEMINI_API_KEY | No | - | API Key Gemini |
|
||||
| TELEGRAM_TOKEN | No | - | Token Bot Telegram |
|
||||
| TELEGRAM_CHAT_ID | No | - | Chat ID Telegram |
|
||||
| CUDA_VISIBLE_DEVICES | No | "all" | GPU a usar |
|
||||
| POLL_INTERVAL | No | 5 | Segundos entre polls |
|
||||
| LOG_LEVEL | No | "INFO" | Nivel de logging |
|
||||
|
||||
## Metricas y Benchmarks
|
||||
|
||||
| Metrica | Valor |
|
||||
|---------|-------|
|
||||
| Lineas main.py | 149 (antes 3167) |
|
||||
| Modulos independientes | 8+ |
|
||||
| Cobertura tests | ~60%+ |
|
||||
| Tiempo inicio | 5-10s |
|
||||
| Transcripcion Whisper | ~1x tiempo audio (GPU) |
|
||||
| OCR PDF | 0.5-2s/pagina |
|
||||
|
||||
## Beneficios de la Arquitectura
|
||||
|
||||
1. **Mantenibilidad**: Cada responsabilidad en su propio modulo
|
||||
2. **Testabilidad**: Servicios independientes y testeables
|
||||
3. **Escalabilidad**: Facil agregar nuevos procesadores/servicios
|
||||
4. **Reutilizacion**: Componentes desacoplados
|
||||
5. **Legibilidad**: Codigo organizado y documentado
|
||||
6. **Seguridad**: Configuracion centralizada sin hardcoding
|
||||
|
||||
## Licencia
|
||||
|
||||
MIT License - Ver LICENSE para detalles.
|
||||
285
README.md
285
README.md
@@ -1,129 +1,206 @@
|
||||
# Nextcloud AI Service v8
|
||||
# CBCFacil v9
|
||||
|
||||
Servicio unificado que escucha automáticamente tu Nextcloud, descarga audios y PDFs, los procesa con Whisper + 3 modelos de IA y publica resúmenes enriquecidos junto con un dashboard en tiempo real.
|
||||
Servicio de IA unificado para procesamiento inteligente de documentos (audio, PDF, texto) con integracion a Nextcloud.
|
||||
|
||||
## Descripcion General
|
||||
|
||||
CBCFacil monitoriza automaticamente tu servidor Nextcloud, descarga archivos multimedia, los transcribe/resume utilizando modelos de IA (Whisper, Claude, Gemini) y genera documentos formateados para su descarga.
|
||||
|
||||
## Arquitectura
|
||||
|
||||
```
|
||||
┌─────────────┐ ┌─────────────┐ ┌──────────────┐ ┌──────────────┐
|
||||
│ Nextcloud │───▶│ Whisper │───▶│ Claude (Z.ai)│───▶│ Gemini (CLI │
|
||||
│ Audios/PDFs │ │ Transcribe │ │ Bullet + │ │ + API) │
|
||||
└─────────────┘ └─────────────┘ │ Resumenes │ │ Formato final │
|
||||
└──────┬───────┘ └──────┬───────┘
|
||||
│ │
|
||||
▼ ▼
|
||||
Markdown / DOCX / PDF Dashboard
|
||||
+----------------+ +--------------+ +-----------------+ +------------------+
|
||||
| Nextcloud |---->| Procesador |---->| IA Services |---->| Dashboard/API |
|
||||
| (WebDAV) | | (Audio/PDF) | | (Claude/Gemini)| | (Flask) |
|
||||
+----------------+ +--------------+ +-----------------+ +------------------+
|
||||
| | |
|
||||
v v v
|
||||
+------------+ +------------+ +------------+
|
||||
| Whisper | | Gemini | | Telegram |
|
||||
| (GPU) | | API/CLI | | (notify) |
|
||||
+------------+ +------------+ +------------+
|
||||
```
|
||||
|
||||
## Principales capacidades
|
||||
- **Pipeline único**: descarga desde Nextcloud vía WebDAV, transcribe con Whisper, resume con Claude + Gemini y clasifica automáticamente.
|
||||
- **Dashboard integrado**: panel Flask (http://localhost:5000) para ver archivos, reprocesar o limpiar estados con un click.
|
||||
- **Diseño GPU-first**: Docker + CUDA 12.1 con PyTorch optimizado; limpieza agresiva de VRAM cuando los modelos están ociosos.
|
||||
- **Alertas opcionales**: soporte Telegram y WebDAV retries para operaciones largas.
|
||||
- **Código limpio**: sin Ollama ni servicios secundarios; sólo lo esencial para escalar y mantener.
|
||||
## Estructura del Proyecto
|
||||
|
||||
## Estructura mínima
|
||||
```
|
||||
cbc/
|
||||
├─ Dockerfile
|
||||
├─ docker-compose.yml
|
||||
├─ main.py # Servicio principal + loop de monitoreo
|
||||
├─ dashboard.py # Flask dashboard reutilizando la lógica de main.py
|
||||
├─ templates/index.html # UI del dashboard
|
||||
├─ requirements.txt
|
||||
└─ README.md
|
||||
cbcfacil/
|
||||
├── main.py # Punto de entrada del servicio
|
||||
├── run.py # Script de ejecucion alternativo
|
||||
├── config/ # Configuracion centralizada
|
||||
│ ├── settings.py # Variables de entorno y settings
|
||||
│ └── validators.py # Validadores de configuracion
|
||||
├── core/ # Nucleo del sistema
|
||||
│ ├── exceptions.py # Excepciones personalizadas
|
||||
│ ├── result.py # Patron Result
|
||||
│ └── base_service.py # Clase base para servicios
|
||||
├── services/ # Servicios externos
|
||||
│   ├── webdav_service.py    # Operaciones WebDAV/Nextcloud
|
||||
│ ├── vram_manager.py # Gestion de memoria GPU
|
||||
│ ├── telegram_service.py # Notificaciones Telegram
|
||||
│ └── ai/ # Proveedores de IA
|
||||
│ ├── base_provider.py # Interfaz base
|
||||
│ ├── claude_provider.py # Claude (Z.ai)
|
||||
│ ├── gemini_provider.py # Gemini API/CLI
|
||||
│ └── provider_factory.py # Factory de proveedores
|
||||
├── processors/ # Procesadores de archivos
|
||||
│ ├── audio_processor.py # Transcripcion Whisper
|
||||
│ ├── pdf_processor.py # OCR y extraccion PDF
|
||||
│ └── text_processor.py # Resumenes y clasificacion
|
||||
├── document/ # Generacion de documentos
|
||||
│ └── generators.py # DOCX, PDF, Markdown
|
||||
├── storage/ # Persistencia
|
||||
│ └── processed_registry.py # Registro de archivos procesados
|
||||
├── api/ # API REST
|
||||
│ └── routes.py # Endpoints Flask
|
||||
├── tests/ # Tests unitarios e integracion
|
||||
├── docs/ # Documentacion
|
||||
│ ├── archive/ # Documentacion historica
|
||||
│ ├── SETUP.md # Guia de configuracion
|
||||
│ ├── TESTING.md # Guia de testing
|
||||
│ └── DEPLOYMENT.md # Guia de despliegue
|
||||
├── requirements.txt # Dependencias Python
|
||||
├── requirements-dev.txt # Dependencias desarrollo
|
||||
├── .env.secrets # Configuracion local (no versionar)
|
||||
└── Dockerfile # Container Docker
|
||||
```
|
||||
|
||||
## Requisitos
|
||||
- NVIDIA GPU con drivers CUDA 12.1+ y `nvidia-container-toolkit` si usas Docker.
|
||||
- Python 3.10+ (el Dockerfile usa 3.10) y Node.js ≥ 20 para las CLI externas.
|
||||
- Claves activas para:
|
||||
- **Z.ai (Claude CLI)** → `ANTHROPIC_AUTH_TOKEN` y `ANTHROPIC_BASE_URL`.
|
||||
- **Google Gemini** (CLI o API) → `GEMINI_API_KEY`.
|
||||
- **DeepInfra GPT-OSS-120B** → `DEEPINFRA_API_KEY`.
|
||||
- Servidor Nextcloud accesible por WebDAV (usuario, contraseña y URL remota).
|
||||
|
||||
### Instalación de las CLIs externas
|
||||
- Python 3.10+
|
||||
- NVIDIA GPU con drivers CUDA 12.1+ (opcional, soporta CPU fallback)
|
||||
- Nextcloud accesible via WebDAV
|
||||
- Claves API para servicios de IA (opcional)
|
||||
|
||||
## Instalacion Rapida
|
||||
|
||||
```bash
|
||||
npm install -g @anthropic-ai/claude-code
|
||||
npm install -g @google/gemini-cli
|
||||
```
|
||||
Recuerda exportar las mismas variables de entorno (`ANTHROPIC_*`, `GEMINI_API_KEY`) para que las CLIs compartan credenciales con el servicio.
|
||||
# Clonar y entrar al directorio
|
||||
git clone <repo_url>
|
||||
cd cbcfacil
|
||||
|
||||
## Configuración
|
||||
1. Copia el ejemplo `.env` (no versionado) y completa:
|
||||
```env
|
||||
NEXTCLOUD_URL=http://tu-servidor:8080/remote.php/webdav
|
||||
NEXTCLOUD_USER=...
|
||||
NEXTCLOUD_PASS=...
|
||||
|
||||
GEMINI_API_KEY=...
|
||||
DEEPINFRA_API_KEY=...
|
||||
ANTHROPIC_BASE_URL=https://api.z.ai/api/anthropic
|
||||
ANTHROPIC_AUTH_TOKEN=...
|
||||
|
||||
TELEGRAM_TOKEN=... (opcional)
|
||||
TELEGRAM_CHAT_ID=... (opcional)
|
||||
```
|
||||
2. Crea los directorios utilizados por los volúmenes (si no existen):
|
||||
```bash
|
||||
mkdir -p downloads resumenes_docx
|
||||
```
|
||||
|
||||
## Ejecución local (sin Docker)
|
||||
```bash
|
||||
# Crear entorno virtual
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install --upgrade pip
|
||||
pip install -r requirements.txt
|
||||
python3 main.py
|
||||
```
|
||||
- El dashboard se expone en `http://localhost:5000`.
|
||||
- Los registros viven en `logs/service.log`.
|
||||
|
||||
## Ejecución con Docker
|
||||
# Instalar dependencias
|
||||
pip install -r requirements.txt
|
||||
|
||||
# Configurar variables de entorno
|
||||
cp .env.example .env.secrets
|
||||
# Editar .env.secrets con tus credenciales
|
||||
|
||||
# Ejecutar el servicio
|
||||
python main.py
|
||||
```
|
||||
|
||||
## Configuracion
|
||||
|
||||
### Variables de Entorno (.env.secrets)
|
||||
|
||||
```bash
|
||||
# Nextcloud/WebDAV (requerido para sincronizacion)
|
||||
NEXTCLOUD_URL=https://tu-nextcloud.com/remote.php/webdav
|
||||
NEXTCLOUD_USER=usuario
|
||||
NEXTCLOUD_PASSWORD=contrasena
|
||||
|
||||
# AI Providers (requerido para resúmenes)
|
||||
ANTHROPIC_AUTH_TOKEN=sk-ant-...
|
||||
GEMINI_API_KEY=AIza...
|
||||
|
||||
# Telegram (opcional)
|
||||
TELEGRAM_TOKEN=bot_token
|
||||
TELEGRAM_CHAT_ID=chat_id
|
||||
|
||||
# GPU (opcional)
|
||||
CUDA_VISIBLE_DEVICES=0
|
||||
```
|
||||
|
||||
Ver `.env.example` para todas las variables disponibles.
|
||||
|
||||
## Uso
|
||||
|
||||
### Servicio Completo
|
||||
|
||||
```bash
|
||||
# Ejecutar con monitoring y dashboard
|
||||
python main.py
|
||||
```
|
||||
|
||||
### Comandos CLI
|
||||
|
||||
```bash
|
||||
# Transcribir audio
|
||||
python main.py whisper <archivo_audio> <output_dir>
|
||||
|
||||
# Procesar PDF
|
||||
python main.py pdf <archivo_pdf> <output_dir>
|
||||
```
|
||||
|
||||
### Dashboard
|
||||
|
||||
El dashboard se expone en `http://localhost:5000` e incluye:
|
||||
- Vista de archivos procesados/pendientes
|
||||
- API REST para integraciones
|
||||
- Endpoints de salud
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
# Ejecutar todos los tests
|
||||
pytest tests/
|
||||
|
||||
# Tests con coverage
|
||||
pytest tests/ --cov=cbcfacil --cov-report=term-missing
|
||||
|
||||
# Tests especificos
|
||||
pytest tests/test_config.py -v
|
||||
pytest tests/test_storage.py -v
|
||||
pytest tests/test_processors.py -v
|
||||
```
|
||||
|
||||
Ver `docs/TESTING.md` para guia completa.
|
||||
|
||||
## Despliegue
|
||||
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
docker compose up -d --build
|
||||
docker compose logs -f app
|
||||
docker logs -f cbcfacil
|
||||
```
|
||||
El único servicio (`nextcloud_ai_app`) ya expone el dashboard y comparte `downloads/` y `resumenes_docx/` como volúmenes.
|
||||
|
||||
## Flujo del pipeline
|
||||
1. **Monitoreo**: cada `POLL_INTERVAL` segundos se listan nuevas entradas en Nextcloud (`Audios`, `Pdf`, `Textos`...).
|
||||
2. **Descarga y preproceso**: los archivos se guardan en `downloads/` con normalización y sanitización de nombres.
|
||||
3. **Transcripción (Whisper)**: modelo `medium` optimizado para español (soporte GPU).
|
||||
4. **Resúmenes colaborativos**:
|
||||
- Claude CLI genera bullet points por lotes y resumen integral.
|
||||
- Gemini CLI/API aplica formato final y estilo consistente.
|
||||
5. **Clasificación y renombrado**: se detectan temáticas (Historia, Contabilidad, Gobierno, Otras Clases) + topics para nombrar archivos inteligentemente.
|
||||
6. **Entrega**: se generan `.md`, `.docx`, `.pdf` y se suben de nuevo a Nextcloud.
|
||||
7. **Dashboard**: refleja estado (procesados/pendientes), permite reprocesar o resetear registros, y sirve descargas locales.
|
||||
### Produccion
|
||||
|
||||
## Dashboard en detalle
|
||||
- **Vista general**: tarjetas con totales, filtros por origen (local/WebDAV) y buscador instantáneo.
|
||||
- **Acciones rápidas**:
|
||||
- `Procesar`: envía el archivo nuevamente al pipeline.
|
||||
- `Resetear`: elimina la marca de procesado para forzar un reprocesamiento automático.
|
||||
- `Descargar`: enlaces directos a TXT/MD/DOCX/PDF disponibles.
|
||||
- **API**:
|
||||
- `GET /api/files` → listado.
|
||||
- `POST /api/reprocess` → reprocesa.
|
||||
- `POST /api/mark-unprocessed` → limpia estado.
|
||||
- `GET /api/refresh` → sincroniza.
|
||||
- `GET /health` → healthcheck.
|
||||
Ver `docs/DEPLOYMENT.md` para guia completa de despliegue.
|
||||
|
||||
## Buenas prácticas operativas
|
||||
- **CUDA libre**: el servicio libera VRAM si los modelos quedan ociosos; revisa el log para ver las limpiezas agresivas.
|
||||
- **Telegram**: usa `ERROR_THROTTLE_SECONDS` para evitar spam en errores repetidos.
|
||||
- **Respaldo**: `processed_files.txt` guarda el historial de todo lo procesado; respáldalo si cambias de servidor.
|
||||
- **Extensibilidad**: si necesitas nuevos formatos o pasos, agrega funciones en `main.py` reutilizando `ThreadPoolExecutor` y los helpers existentes.
|
||||
## Metricas de Performance
|
||||
|
||||
## Troubleshooting rápido
|
||||
| Problema | Fix |
|
||||
| --- | --- |
|
||||
| Claude o Gemini devuelven error de permisos | Exporta `CLAUDE_DANGEROUSLY_SKIP_PERMISSIONS=1` (ya se envía al contenedor). |
|
||||
| No aparecen archivos en el dashboard | Verifica credenciales Nextcloud y `logs/service.log`. |
|
||||
| Tiempo de espera alto en WebDAV | Ajusta `HTTP_TIMEOUT` y `WEBDAV_MAX_RETRIES` en `.env`. |
|
||||
| GPU ocupada constantemente | Reduce `MODEL_TIMEOUT_SECONDS` o baja el tamaño del modelo Whisper. |
|
||||
| Componente | Metrica |
|
||||
|------------|---------|
|
||||
| main.py | 149 lineas (antes 3167) |
|
||||
| Cobertura tests | ~60%+ |
|
||||
| Tiempo inicio | ~5-10s |
|
||||
| Transcripcion Whisper | ~1x tiempo audio (GPU) |
|
||||
| Resumen Claude | ~2-5s por documento |
|
||||
| OCR PDF | Depende de paginas |
|
||||
|
||||
---
|
||||
## Contribucion
|
||||
|
||||
La meta de esta versión es mantener el proyecto ágil y escalable: menos archivos, menos servicios y una sola fuente de verdad. Si necesitas habilitar nuevas integraciones (por ejemplo, más modelos o destinos), añade módulos dedicados dentro de `main.py` o expórtalos a un paquete propio manteniendo este núcleo ligero. ¡Felices resúmenes! 💡
|
||||
1. Fork el repositorio
|
||||
2. Crear branch feature (`git checkout -b feature/nueva-funcionalidad`)
|
||||
3. Commit cambios (`git commit -am 'Add nueva funcionalidad'`)
|
||||
4. Push al branch (`git push origin feature/nueva-funcionalidad`)
|
||||
5. Crear Pull Request
|
||||
|
||||
## Documentacion
|
||||
|
||||
- `docs/SETUP.md` - Guia de configuracion inicial
|
||||
- `docs/TESTING.md` - Guia de testing
|
||||
- `docs/DEPLOYMENT.md` - Guia de despliegue
|
||||
- `ARCHITECTURE.md` - Documentacion arquitectonica
|
||||
- `CHANGELOG.md` - Historial de cambios
|
||||
|
||||
## Licencia
|
||||
|
||||
MIT License - Ver LICENSE para detalles.
|
||||
|
||||
187
amd/ROCM_SETUP.md
Normal file
187
amd/ROCM_SETUP.md
Normal file
@@ -0,0 +1,187 @@
|
||||
# 🚀 ROCm Setup para AMD GPU
|
||||
|
||||
## ✅ Estado Actual del Sistema
|
||||
|
||||
**GPU**: AMD Radeon RX 6800 XT
|
||||
**ROCm**: 6.0
|
||||
**PyTorch**: 2.5.0+rocm6.0
|
||||
|
||||
## 📋 Comandos Esenciales
|
||||
|
||||
### Verificar GPU
|
||||
```bash
|
||||
# Información básica de la GPU
|
||||
lspci | grep -i vga
|
||||
|
||||
# Estado en tiempo real de ROCm
|
||||
rocm-smi
|
||||
|
||||
# Información detallada del sistema
|
||||
rocminfo
|
||||
```
|
||||
|
||||
### Variables de Entorno Críticas
|
||||
```bash
|
||||
# CRÍTICO para gfx1030 (RX 6000 series)
|
||||
export HSA_OVERRIDE_GFX_VERSION=10.3.0
|
||||
|
||||
# Agregar al ~/.bashrc o ~/.zshrc
|
||||
echo 'export HSA_OVERRIDE_GFX_VERSION=10.3.0' >> ~/.bashrc
|
||||
source ~/.bashrc
|
||||
```
|
||||
|
||||
### Verificar PyTorch con ROCm
|
||||
```bash
|
||||
# Test básico de PyTorch
|
||||
python3 -c "
|
||||
import torch
|
||||
print(f'PyTorch: {torch.__version__}')
|
||||
print(f'ROCm disponible: {torch.cuda.is_available()}')
|
||||
print(f'Dispositivos: {torch.cuda.device_count()}')
|
||||
if torch.cuda.is_available():
|
||||
print(f'GPU: {torch.cuda.get_device_name(0)}')
|
||||
print(f'Memoria: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB')
|
||||
"
|
||||
|
||||
# Benchmark rápido
|
||||
python3 -c "
|
||||
import torch, time
|
||||
a = torch.randn(4096, 4096, device='cuda')
|
||||
b = torch.randn(4096, 4096, device='cuda')
|
||||
start = time.time()
|
||||
c = torch.matmul(a, b)
|
||||
torch.cuda.synchronize()
|
||||
print(f'GPU time: {time.time() - start:.4f}s')
|
||||
"
|
||||
```
|
||||
|
||||
## 🧪 Script de Stress Test
|
||||
|
||||
### Ejecutar Stress Test (2 minutos)
|
||||
```bash
|
||||
python3 /home/ren/gpu/rocm_stress_test.py
|
||||
```
|
||||
|
||||
## 🔧 Troubleshooting
|
||||
|
||||
### Si ROCm no detecta la GPU:
|
||||
```bash
|
||||
# Verificar módulos del kernel
|
||||
lsmod | grep amdgpu
|
||||
lsmod | grep kfd
|
||||
|
||||
# Recargar módulos
|
||||
sudo modprobe amdgpu
|
||||
sudo modprobe kfd
|
||||
|
||||
# Verificar logs
|
||||
dmesg | grep amdgpu
|
||||
```
|
||||
|
||||
### Si PyTorch no encuentra ROCm:
|
||||
```bash
|
||||
# Reinstalar PyTorch con ROCm
|
||||
pip uninstall torch torchvision torchaudio
|
||||
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0
|
||||
```
|
||||
|
||||
### Si hay errores de memoria:
|
||||
```bash
|
||||
# Limpiar cache de GPU
|
||||
python3 -c "import torch; torch.cuda.empty_cache()"
|
||||
|
||||
# Verificar uso de memoria
|
||||
rocm-smi --meminfo
|
||||
```
|
||||
|
||||
## 📊 Monitoreo Continuo
|
||||
|
||||
### Terminal 1 - Monitor en tiempo real
|
||||
```bash
|
||||
watch -n 1 rocm-smi
|
||||
```
|
||||
|
||||
### Terminal 2 - Información detallada
|
||||
```bash
|
||||
rocm-smi --showtemp --showmeminfo vram --showmeminfo all
|
||||
```
|
||||
|
||||
## 💡 Ejemplos de Uso
|
||||
|
||||
### Cargar modelo en GPU
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModel
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
print(f"Usando dispositivo: {device}")
|
||||
|
||||
model = AutoModel.from_pretrained("bert-base-uncased")
|
||||
model = model.to(device)
|
||||
|
||||
# Los tensores ahora se procesarán en la GPU
|
||||
inputs = torch.tensor([1, 2, 3]).to(device)
|
||||
```
|
||||
|
||||
### Entrenamiento en GPU
|
||||
```python
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
device = torch.device("cuda")
|
||||
model = tu_modelo().to(device)
|
||||
criterion = nn.CrossEntropyLoss().to(device)
|
||||
optimizer = torch.optim.Adam(model.parameters())
|
||||
|
||||
for epoch in range(epochs):
|
||||
for batch in dataloader:
|
||||
inputs, labels = batch
|
||||
inputs, labels = inputs.to(device), labels.to(device)
|
||||
|
||||
optimizer.zero_grad()
|
||||
outputs = model(inputs)
|
||||
loss = criterion(outputs, labels)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
```
|
||||
|
||||
## 🎯 Optimizaciones
|
||||
|
||||
### Para mejor rendimiento:
|
||||
```python
|
||||
# Usar mixed precision (más rápido en RDNA2)
|
||||
from torch.cuda.amp import autocast, GradScaler
|
||||
|
||||
scaler = GradScaler()
|
||||
with autocast():
|
||||
output = model(inputs)
|
||||
loss = criterion(output, targets)
|
||||
|
||||
scaler.scale(loss).backward()
|
||||
scaler.step(optimizer)
|
||||
scaler.update()
|
||||
```
|
||||
|
||||
## 📈 Comandos Útiles
|
||||
|
||||
```bash
|
||||
# Ver versión de ROCm
|
||||
rocm-smi --version
|
||||
|
||||
# Verificar HSA
|
||||
rocminfo
|
||||
|
||||
# Test de compatibilidad
|
||||
python3 /opt/rocm/bin/rocprofiler-compute-test.py
|
||||
|
||||
# Verificar backend MPS (solo existe en Apple Silicon; siempre False en AMD/ROCm)
python3 -c "import torch; print(torch.backends.mps.is_available())"
|
||||
```
|
||||
|
||||
## ⚡ Performance Tips
|
||||
|
||||
1. **Siempre mueve datos a GPU**: `.to(device)`
|
||||
2. **Usa batch sizes grandes**: Aprovecha los 16GB de VRAM
|
||||
3. **Mixed precision**: Acelera el entrenamiento 1.5-2x
|
||||
4. **DataLoader con num_workers**: Carga datos en paralelo
|
||||
5. **torch.cuda.synchronize()**: Para benchmarks precisos
|
||||
255
amd/rocm_stress_test.py
Executable file
255
amd/rocm_stress_test.py
Executable file
@@ -0,0 +1,255 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
🔥 ROCm Stress Test - Prueba de resistencia para GPU AMD
|
||||
Ejecuta operaciones intensivas durante 2 minutos y monitorea métricas
|
||||
"""
|
||||
|
||||
import torch
|
||||
import time
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
def print_header():
|
||||
print("=" * 70)
|
||||
print("🔥 ROCm STRESS TEST - AMD GPU STRESS TEST")
|
||||
print("=" * 70)
|
||||
print(f"Inicio: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")
|
||||
print(f"VRAM Total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
|
||||
print("=" * 70)
|
||||
print()
|
||||
|
||||
def get_gpu_stats():
|
||||
"""Obtener estadísticas actuales de la GPU"""
|
||||
if not torch.cuda.is_available():
|
||||
return None
|
||||
|
||||
props = torch.cuda.get_device_properties(0)
|
||||
mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
|
||||
mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
|
||||
mem_total = props.total_memory / 1024**3
|
||||
|
||||
return {
|
||||
'mem_allocated': mem_allocated,
|
||||
'mem_reserved': mem_reserved,
|
||||
'mem_total': mem_total,
|
||||
'mem_percent': (mem_allocated / mem_total) * 100
|
||||
}
|
||||
|
||||
def stress_test(duration_seconds=120):
|
||||
"""Ejecutar stress test durante duración especificada"""
|
||||
print(f"🧪 Iniciando stress test por {duration_seconds} segundos...")
|
||||
print(f" Presiona Ctrl+C para detener en cualquier momento\n")
|
||||
|
||||
if not torch.cuda.is_available():
|
||||
print("❌ ERROR: CUDA/ROCm no está disponible!")
|
||||
sys.exit(1)
|
||||
|
||||
device = torch.device("cuda")
|
||||
torch.cuda.set_per_process_memory_fraction(0.85, 0) # Usar 85% de VRAM
|
||||
|
||||
# Inicializar
|
||||
results = {
|
||||
'matmul_times': [],
|
||||
'conv_times': [],
|
||||
'reLU_times': [],
|
||||
'softmax_times': [],
|
||||
'iterations': 0,
|
||||
'errors': 0
|
||||
}
|
||||
|
||||
start_time = time.time()
|
||||
iteration = 0
|
||||
last_print = 0
|
||||
|
||||
try:
|
||||
while time.time() - start_time < duration_seconds:
|
||||
iteration += 1
|
||||
results['iterations'] = iteration
|
||||
|
||||
try:
|
||||
# Operaciones intensivas de ML
|
||||
# 1. Matriz multiplicación (varía el tamaño)
|
||||
if iteration % 5 == 0:
|
||||
size = 8192 # Matriz grande ocasionalmente
|
||||
elif iteration % 3 == 0:
|
||||
size = 4096
|
||||
else:
|
||||
size = 2048
|
||||
|
||||
a = torch.randn(size, size, device=device, dtype=torch.float16)
|
||||
b = torch.randn(size, size, device=device, dtype=torch.float16)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
t0 = time.time()
|
||||
c = torch.matmul(a, b)
|
||||
torch.cuda.synchronize()
|
||||
matmul_time = time.time() - t0
|
||||
results['matmul_times'].append(matmul_time)
|
||||
|
||||
del a, b, c
|
||||
|
||||
# 2. Convolución 3D
|
||||
x = torch.randn(32, 128, 64, 64, 64, device=device)
|
||||
conv = torch.nn.Conv3d(128, 256, kernel_size=3, padding=1).to(device)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
t0 = time.time()
|
||||
out = conv(x)
|
||||
torch.cuda.synchronize()
|
||||
conv_time = time.time() - t0
|
||||
results['conv_times'].append(conv_time)
|
||||
|
||||
# 3. ReLU + BatchNorm
|
||||
bn = torch.nn.BatchNorm2d(256).to(device)
|
||||
torch.cuda.synchronize()
|
||||
t0 = time.time()
|
||||
out = bn(torch.relu(out))
|
||||
torch.cuda.synchronize()
|
||||
relu_time = time.time() - t0
|
||||
results['reLU_times'].append(relu_time)
|
||||
|
||||
del x, out
|
||||
|
||||
# 4. Softmax grande
|
||||
x = torch.randn(2048, 2048, device=device)
|
||||
torch.cuda.synchronize()
|
||||
t0 = time.time()
|
||||
softmax_out = torch.softmax(x, dim=-1)
|
||||
torch.cuda.synchronize()
|
||||
softmax_time = time.time() - t0
|
||||
results['softmax_times'].append(softmax_time)
|
||||
|
||||
del softmax_out
|
||||
|
||||
except Exception as e:
|
||||
results['errors'] += 1
|
||||
if results['errors'] < 5: # Solo mostrar primeros errores
|
||||
print(f"\n⚠️ Error en iteración {iteration}: {str(e)[:50]}")
|
||||
|
||||
# Progress cada segundo
|
||||
elapsed = time.time() - start_time
|
||||
if elapsed - last_print >= 1.0 or elapsed >= duration_seconds:
|
||||
last_print = elapsed
|
||||
progress = (elapsed / duration_seconds) * 100
|
||||
|
||||
# Stats
|
||||
stats = get_gpu_stats()
|
||||
matmul_avg = sum(results['matmul_times'][-10:]) / len(results['matmul_times'][-10:]) if results['matmul_times'] else 0
|
||||
|
||||
print(f"\r⏱️ Iteración {iteration:4d} | "
|
||||
f"Tiempo: {elapsed:6.1f}s/{duration_seconds}s [{progress:5.1f}%] | "
|
||||
f"VRAM: {stats['mem_allocated']:5.2f}GB/{stats['mem_total']:.2f}GB ({stats['mem_percent']:5.1f}%) | "
|
||||
f"MatMul avg: {matmul_avg*1000:6.2f}ms | "
|
||||
f"Iter/s: {iteration/elapsed:5.2f}",
|
||||
end='', flush=True)
|
||||
|
||||
# Pequeña pausa para evitar sobrecarga
|
||||
time.sleep(0.05)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n\n⏹️ Interrumpido por el usuario")
|
||||
elapsed = time.time() - start_time
|
||||
print(f" Duración real: {elapsed:.1f} segundos")
|
||||
print(f" Iteraciones: {iteration}")
|
||||
|
||||
print("\n")
|
||||
return results
|
||||
|
||||
def print_summary(results, duration):
|
||||
"""Imprimir resumen de resultados"""
|
||||
print("\n" + "=" * 70)
|
||||
print("📊 RESUMEN DEL STRESS TEST")
|
||||
print("=" * 70)
|
||||
print(f"Duración total: {duration:.2f} segundos")
|
||||
print(f"Iteraciones completadas: {results['iterations']}")
|
||||
print(f"Errores: {results['errors']}")
|
||||
print()
|
||||
|
||||
if results['matmul_times']:
|
||||
matmul_avg = sum(results['matmul_times']) / len(results['matmul_times'])
|
||||
matmul_min = min(results['matmul_times'])
|
||||
matmul_max = max(results['matmul_times'])
|
||||
matmul_last10_avg = sum(results['matmul_times'][-10:]) / len(results['matmul_times'][-10:])
|
||||
print(f"🔢 MATRIZ MULTIPLICACIÓN (2048-8192)")
|
||||
print(f" Promedio: {matmul_avg*1000:8.2f} ms")
|
||||
print(f" Últimas 10: {matmul_last10_avg*1000:8.2f} ms")
|
||||
print(f" Mínimo: {matmul_min*1000:8.2f} ms")
|
||||
print(f" Máximo: {matmul_max*1000:8.2f} ms")
|
||||
print()
|
||||
|
||||
if results['conv_times']:
|
||||
conv_avg = sum(results['conv_times']) / len(results['conv_times'])
|
||||
conv_min = min(results['conv_times'])
|
||||
conv_max = max(results['conv_times'])
|
||||
print(f"🧮 CONVOLUCIÓN 3D (32x128x64³)")
|
||||
print(f" Promedio: {conv_avg*1000:8.2f} ms")
|
||||
print(f" Mínimo: {conv_min*1000:8.2f} ms")
|
||||
print(f" Máximo: {conv_max*1000:8.2f} ms")
|
||||
print()
|
||||
|
||||
if results['reLU_times']:
|
||||
relu_avg = sum(results['reLU_times']) / len(results['reLU_times'])
|
||||
relu_min = min(results['reLU_times'])
|
||||
relu_max = max(results['reLU_times'])
|
||||
print(f"⚡ ReLU + BatchNorm")
|
||||
print(f" Promedio: {relu_avg*1000:8.4f} ms")
|
||||
print(f" Mínimo: {relu_min*1000:8.4f} ms")
|
||||
print(f" Máximo: {relu_max*1000:8.4f} ms")
|
||||
print()
|
||||
|
||||
if results['softmax_times']:
|
||||
softmax_avg = sum(results['softmax_times']) / len(results['softmax_times'])
|
||||
softmax_min = min(results['softmax_times'])
|
||||
softmax_max = max(results['softmax_times'])
|
||||
print(f"🔥 Softmax (2048x2048)")
|
||||
print(f" Promedio: {softmax_avg*1000:8.2f} ms")
|
||||
print(f" Mínimo: {softmax_min*1000:8.2f} ms")
|
||||
print(f" Máximo: {softmax_max*1000:8.2f} ms")
|
||||
print()
|
||||
|
||||
# Performance score
|
||||
total_ops = results['iterations']
|
||||
if total_ops > 0 and duration > 0:
|
||||
print("=" * 70)
|
||||
print(f"✅ TEST COMPLETADO")
|
||||
print(f" 📈 {total_ops} operaciones en {duration:.1f} segundos")
|
||||
print(f" ⚡ {total_ops/duration:.2f} operaciones/segundo")
|
||||
print(f" 💾 Uso de VRAM: Hasta ~85% (configurado)")
|
||||
print("=" * 70)
|
||||
print()
|
||||
|
||||
# Calcular GFLOPS aproximado para matmul
|
||||
if results['matmul_times']:
|
||||
# GFLOPS = 2 * n^3 / (time * 10^9) para matriz n x n
|
||||
avg_matmul_ms = (sum(results['matmul_times']) / len(results['matmul_times'])) * 1000
|
||||
avg_n = sum([2048 if i%3==0 else 4096 if i%5==0 else 2048 for i in range(len(results['matmul_times']))]) / len(results['matmul_times'])
|
||||
gflops = (2 * (avg_n**3)) / (avg_matmul_ms / 1000) / 1e9
|
||||
print(f"🚀 RENDIMIENTO ESTIMADO")
|
||||
print(f" ~{gflops:.2f} GFLOPS (matriz multiplicación)")
|
||||
|
||||
def main():
|
||||
print_header()
|
||||
|
||||
# Verificar ROCm
|
||||
if not torch.cuda.is_available():
|
||||
print("❌ ERROR: ROCm/CUDA no está disponible!")
|
||||
print(" Ejecuta: export HSA_OVERRIDE_GFX_VERSION=10.3.0")
|
||||
sys.exit(1)
|
||||
|
||||
# Ejecutar stress test
|
||||
duration = 120 # 2 minutos
|
||||
start = time.time()
|
||||
results = stress_test(duration)
|
||||
actual_duration = time.time() - start
|
||||
|
||||
# Mostrar resumen
|
||||
print_summary(results, actual_duration)
|
||||
|
||||
# Limpiar
|
||||
torch.cuda.empty_cache()
|
||||
print("🧹 Cache de GPU limpiado")
|
||||
print("\n✅ Stress test finalizado")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
7
api/__init__.py
Normal file
7
api/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""
|
||||
API package for CBCFacil
|
||||
"""
|
||||
|
||||
from .routes import create_app
|
||||
|
||||
__all__ = ['create_app']
|
||||
280
api/routes.py
Normal file
280
api/routes.py
Normal file
@@ -0,0 +1,280 @@
|
||||
"""
|
||||
Flask API routes for CBCFacil dashboard
|
||||
"""
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List
|
||||
from flask import Flask, render_template, request, jsonify, send_from_directory
|
||||
from flask_cors import CORS
|
||||
|
||||
from ..config import settings
|
||||
from ..storage import processed_registry
|
||||
from ..services.webdav_service import webdav_service
|
||||
from ..services import vram_manager
|
||||
|
||||
|
||||
def create_app() -> Flask:
|
||||
"""Create and configure Flask application"""
|
||||
app = Flask(__name__)
|
||||
CORS(app)
|
||||
|
||||
# Configure app
|
||||
app.config['SECRET_KEY'] = settings.DASHBOARD_SECRET_KEY or os.urandom(24)
|
||||
app.config['DOWNLOADS_FOLDER'] = str(settings.LOCAL_DOWNLOADS_PATH)
|
||||
|
||||
@app.route('/')
|
||||
def index():
|
||||
"""Dashboard home page"""
|
||||
return render_template('index.html')
|
||||
|
||||
@app.route('/api/files')
|
||||
def get_files():
|
||||
"""Get list of audio files"""
|
||||
try:
|
||||
files = get_audio_files()
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'files': files,
|
||||
'total': len(files),
|
||||
'processed': sum(1 for f in files if f['processed']),
|
||||
'pending': sum(1 for f in files if not f['processed'])
|
||||
})
|
||||
except Exception as e:
|
||||
app.logger.error(f"Error getting files: {e}")
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'message': f"Error: {str(e)}"
|
||||
}), 500
|
||||
|
||||
@app.route('/api/reprocess', methods=['POST'])
|
||||
def reprocess_file():
|
||||
"""Reprocess a file"""
|
||||
try:
|
||||
data = request.get_json()
|
||||
file_path = data.get('path')
|
||||
source = data.get('source', 'local')
|
||||
|
||||
if not file_path:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'message': "Path del archivo es requerido"
|
||||
}), 400
|
||||
|
||||
# TODO: Implement file reprocessing
|
||||
# This would trigger the main processing loop
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'message': f"Archivo {Path(file_path).name} enviado a reprocesamiento"
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
app.logger.error(f"Error reprocessing file: {e}")
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'message': f"Error: {str(e)}"
|
||||
}), 500
|
||||
|
||||
@app.route('/api/mark-unprocessed', methods=['POST'])
|
||||
def mark_unprocessed():
|
||||
"""Mark file as unprocessed"""
|
||||
try:
|
||||
data = request.get_json()
|
||||
file_path = data.get('path')
|
||||
|
||||
if not file_path:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'message': "Path del archivo es requerido"
|
||||
}), 400
|
||||
|
||||
success = processed_registry.remove(file_path)
|
||||
|
||||
if success:
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'message': "Archivo marcado como no procesado"
|
||||
})
|
||||
else:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'message': "No se pudo marcar como no procesado"
|
||||
}), 500
|
||||
|
||||
except Exception as e:
|
||||
app.logger.error(f"Error marking unprocessed: {e}")
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'message': f"Error: {str(e)}"
|
||||
}), 500
|
||||
|
||||
@app.route('/api/refresh')
|
||||
def refresh_files():
|
||||
"""Refresh file list"""
|
||||
try:
|
||||
processed_registry.load()
|
||||
files = get_audio_files()
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'message': "Lista de archivos actualizada",
|
||||
'files': files
|
||||
})
|
||||
except Exception as e:
|
||||
app.logger.error(f"Error refreshing files: {e}")
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'message': f"Error: {str(e)}"
|
||||
}), 500
|
||||
|
||||
@app.route('/downloads/<path:filename>')
|
||||
def download_file(filename):
|
||||
"""Download file"""
|
||||
try:
|
||||
# Validate path to prevent traversal and injection attacks
|
||||
normalized = Path(filename).resolve()
|
||||
base_downloads = Path(str(settings.LOCAL_DOWNLOADS_PATH)).resolve()
|
||||
base_docx = Path(str(settings.LOCAL_DOCX)).resolve()
|
||||
if '..' in filename or filename.startswith('/') or \
|
||||
normalized.parts[0] in ['..', '...'] if len(normalized.parts) > 0 else False or \
|
||||
not (normalized == base_downloads or normalized.is_relative_to(base_downloads) or
|
||||
normalized == base_docx or normalized.is_relative_to(base_docx)):
|
||||
return jsonify({'error': 'Invalid filename'}), 400
|
||||
|
||||
# Try downloads directory
|
||||
downloads_path = settings.LOCAL_DOWNLOADS_PATH / filename
|
||||
if downloads_path.exists():
|
||||
return send_from_directory(str(settings.LOCAL_DOWNLOADS_PATH), filename)
|
||||
|
||||
# Try resumenes_docx directory
|
||||
docx_path = settings.LOCAL_DOCX / filename
|
||||
if docx_path.exists():
|
||||
return send_from_directory(str(settings.LOCAL_DOCX), filename)
|
||||
|
||||
return jsonify({'error': 'File not found'}), 404
|
||||
|
||||
except Exception as e:
|
||||
app.logger.error(f"Error downloading file: {e}")
|
||||
return jsonify({'error': 'File not found'}), 404
|
||||
|
||||
@app.route('/health')
|
||||
def health_check():
|
||||
"""Health check endpoint"""
|
||||
gpu_info = vram_manager.get_usage()
|
||||
|
||||
return jsonify({
|
||||
'status': 'healthy',
|
||||
'timestamp': datetime.now().isoformat(),
|
||||
'processed_files_count': processed_registry.count(),
|
||||
'gpu': gpu_info,
|
||||
'config': {
|
||||
'webdav_configured': settings.has_webdav_config,
|
||||
'ai_configured': settings.has_ai_config,
|
||||
'debug': settings.DEBUG
|
||||
}
|
||||
})
|
||||
|
||||
return app
|
||||
|
||||
|
||||
def get_audio_files() -> List[Dict[str, Any]]:
|
||||
"""Get list of audio files from WebDAV and local"""
|
||||
files = []
|
||||
|
||||
# Get files from WebDAV
|
||||
if settings.has_webdav_config:
|
||||
try:
|
||||
webdav_files = webdav_service.list(settings.REMOTE_AUDIOS_FOLDER)
|
||||
for file_path in webdav_files:
|
||||
normalized_path = webdav_service.normalize_path(file_path)
|
||||
base_name = Path(normalized_path).name
|
||||
|
||||
if any(normalized_path.lower().endswith(ext) for ext in settings.AUDIO_EXTENSIONS):
|
||||
is_processed = processed_registry.is_processed(normalized_path)
|
||||
|
||||
files.append({
|
||||
'filename': base_name,
|
||||
'path': normalized_path,
|
||||
'source': 'webdav',
|
||||
'processed': is_processed,
|
||||
'size': 'Unknown',
|
||||
'last_modified': 'Unknown',
|
||||
'available_formats': get_available_formats(base_name)
|
||||
})
|
||||
except Exception as e:
|
||||
app.logger.error(f"Error getting WebDAV files: {e}")
|
||||
|
||||
# Get local files
|
||||
try:
|
||||
if settings.LOCAL_DOWNLOADS_PATH.exists():
|
||||
for ext in settings.AUDIO_EXTENSIONS:
|
||||
for file_path in settings.LOCAL_DOWNLOADS_PATH.glob(f"*{ext}"):
|
||||
stat = file_path.stat()
|
||||
is_processed = processed_registry.is_processed(file_path.name)
|
||||
|
||||
files.append({
|
||||
'filename': file_path.name,
|
||||
'path': str(file_path),
|
||||
'source': 'local',
|
||||
'processed': is_processed,
|
||||
'size': format_size(stat.st_size),
|
||||
'last_modified': datetime.fromtimestamp(stat.st_mtime).strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'available_formats': get_available_formats(file_path.name)
|
||||
})
|
||||
except Exception as e:
|
||||
app.logger.error(f"Error getting local files: {e}")
|
||||
|
||||
# Remove duplicates (WebDAV takes precedence)
|
||||
unique_files = {}
|
||||
for file in files:
|
||||
key = file['filename']
|
||||
if key not in unique_files or file['source'] == 'webdav':
|
||||
unique_files[key] = file
|
||||
|
||||
return sorted(unique_files.values(), key=lambda x: x['filename'])
|
||||
|
||||
|
||||
def get_available_formats(audio_filename: str) -> Dict[str, bool]:
|
||||
"""Check which output formats are available for an audio file"""
|
||||
base_name = Path(audio_filename).stem
|
||||
|
||||
formats = {
|
||||
'txt': False,
|
||||
'md': False,
|
||||
'pdf': False,
|
||||
'docx': False
|
||||
}
|
||||
|
||||
directories_to_check = [
|
||||
settings.LOCAL_DOWNLOADS_PATH,
|
||||
settings.LOCAL_DOCX
|
||||
]
|
||||
|
||||
for directory in directories_to_check:
|
||||
if not directory.exists():
|
||||
continue
|
||||
|
||||
for ext in formats.keys():
|
||||
name_variants = [
|
||||
base_name,
|
||||
f"{base_name}_unificado",
|
||||
base_name.replace(' ', '_'),
|
||||
f"{base_name.replace(' ', '_')}_unificado",
|
||||
]
|
||||
|
||||
for name_variant in name_variants:
|
||||
file_path = directory / f"{name_variant}.{ext}"
|
||||
if file_path.exists():
|
||||
formats[ext] = True
|
||||
break
|
||||
|
||||
return formats
|
||||
|
||||
|
||||
def format_size(size_bytes: int) -> str:
|
||||
"""Format size in human-readable format"""
|
||||
for unit in ['B', 'KB', 'MB', 'GB']:
|
||||
if size_bytes < 1024.0:
|
||||
return f"{size_bytes:.1f} {unit}"
|
||||
size_bytes /= 1024.0
|
||||
return f"{size_bytes:.1f} TB"
|
||||
69
backup/originals_20250109/ai_service.py
Normal file
69
backup/originals_20250109/ai_service.py
Normal file
@@ -0,0 +1,69 @@
|
||||
"""
|
||||
AI Service - Unified interface for AI providers
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
from .config import settings
|
||||
from .core import AIProcessingError
|
||||
from .services.ai.provider_factory import AIProviderFactory, ai_provider_factory
|
||||
|
||||
|
||||
class AIService:
|
||||
"""Unified service for AI operations with provider fallback"""
|
||||
|
||||
def __init__(self):
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._factory: Optional[AIProviderFactory] = None
|
||||
|
||||
@property
|
||||
def factory(self) -> AIProviderFactory:
|
||||
"""Lazy initialization of provider factory"""
|
||||
if self._factory is None:
|
||||
self._factory = ai_provider_factory
|
||||
return self._factory
|
||||
|
||||
def generate_text(
|
||||
self,
|
||||
prompt: str,
|
||||
provider: Optional[str] = None,
|
||||
max_tokens: int = 4096
|
||||
) -> str:
|
||||
"""Generate text using AI provider"""
|
||||
try:
|
||||
ai_provider = self.factory.get_provider(provider or 'gemini')
|
||||
return ai_provider.generate(prompt, max_tokens=max_tokens)
|
||||
except AIProcessingError as e:
|
||||
self.logger.error(f"AI generation failed: {e}")
|
||||
return f"Error: {str(e)}"
|
||||
|
||||
def summarize(self, text: str, **kwargs) -> str:
|
||||
"""Generate summary of text"""
|
||||
try:
|
||||
provider = self.factory.get_best_provider()
|
||||
return provider.summarize(text, **kwargs)
|
||||
except AIProcessingError as e:
|
||||
self.logger.error(f"Summarization failed: {e}")
|
||||
return f"Error: {str(e)}"
|
||||
|
||||
def correct_text(self, text: str, **kwargs) -> str:
|
||||
"""Correct grammar and spelling in text"""
|
||||
try:
|
||||
provider = self.factory.get_best_provider()
|
||||
return provider.correct_text(text, **kwargs)
|
||||
except AIProcessingError as e:
|
||||
self.logger.error(f"Text correction failed: {e}")
|
||||
return text # Return original on error
|
||||
|
||||
def classify_content(self, text: str, **kwargs) -> Dict[str, Any]:
|
||||
"""Classify content into categories"""
|
||||
try:
|
||||
provider = self.factory.get_best_provider()
|
||||
return provider.classify_content(text, **kwargs)
|
||||
except AIProcessingError as e:
|
||||
self.logger.error(f"Classification failed: {e}")
|
||||
return {"category": "otras_clases", "confidence": 0.0}
|
||||
|
||||
|
||||
# Global instance
|
||||
ai_service = AIService()
|
||||
171
backup/originals_20250109/gemini_provider.py
Normal file
171
backup/originals_20250109/gemini_provider.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
Gemini AI Provider implementation
|
||||
"""
|
||||
import logging
|
||||
import subprocess
|
||||
import shutil
|
||||
import requests
|
||||
import time
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from ..config import settings
|
||||
from ..core import AIProcessingError
|
||||
from .base_provider import AIProvider
|
||||
|
||||
|
||||
class GeminiProvider(AIProvider):
|
||||
"""Gemini AI provider using CLI or API"""
|
||||
|
||||
def __init__(self):
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._cli_path = settings.GEMINI_CLI_PATH or shutil.which("gemini")
|
||||
self._api_key = settings.GEMINI_API_KEY
|
||||
self._flash_model = settings.GEMINI_FLASH_MODEL
|
||||
self._pro_model = settings.GEMINI_PRO_MODEL
|
||||
self._session = None
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return "Gemini"
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if Gemini is available"""
|
||||
return bool(self._cli_path or self._api_key)
|
||||
|
||||
def _run_cli(self, prompt: str, use_flash: bool = True, timeout: int = 300) -> str:
|
||||
"""Run Gemini CLI with prompt"""
|
||||
if not self._cli_path:
|
||||
raise AIProcessingError("Gemini CLI not available")
|
||||
|
||||
model = self._flash_model if use_flash else self._pro_model
|
||||
cmd = [self._cli_path, model, prompt]
|
||||
|
||||
try:
|
||||
process = subprocess.run(
|
||||
cmd,
|
||||
text=True,
|
||||
capture_output=True,
|
||||
timeout=timeout,
|
||||
shell=False
|
||||
)
|
||||
|
||||
if process.returncode != 0:
|
||||
error_msg = process.stderr or "Unknown error"
|
||||
raise AIProcessingError(f"Gemini CLI failed: {error_msg}")
|
||||
|
||||
return process.stdout.strip()
|
||||
except subprocess.TimeoutExpired:
|
||||
raise AIProcessingError(f"Gemini CLI timed out after {timeout}s")
|
||||
except Exception as e:
|
||||
raise AIProcessingError(f"Gemini CLI error: {e}")
|
||||
|
||||
def _call_api(self, prompt: str, use_flash: bool = True, timeout: int = 180) -> str:
|
||||
"""Call Gemini API"""
|
||||
if not self._api_key:
|
||||
raise AIProcessingError("Gemini API key not configured")
|
||||
|
||||
model = self._flash_model if use_flash else self._pro_model
|
||||
|
||||
# Initialize session if needed
|
||||
if self._session is None:
|
||||
self._session = requests.Session()
|
||||
adapter = requests.adapters.HTTPAdapter(
|
||||
pool_connections=10,
|
||||
pool_maxsize=20
|
||||
)
|
||||
self._session.mount('https://', adapter)
|
||||
|
||||
url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
|
||||
|
||||
payload = {
|
||||
"contents": [{
|
||||
"parts": [{"text": prompt}]
|
||||
}]
|
||||
}
|
||||
|
||||
params = {"key": self._api_key}
|
||||
|
||||
try:
|
||||
response = self._session.post(
|
||||
url,
|
||||
json=payload,
|
||||
params=params,
|
||||
timeout=timeout
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
data = response.json()
|
||||
|
||||
if "candidates" not in data or not data["candidates"]:
|
||||
raise AIProcessingError("Empty response from Gemini API")
|
||||
|
||||
candidate = data["candidates"][0]
|
||||
if "content" not in candidate or "parts" not in candidate["content"]:
|
||||
raise AIProcessingError("Invalid response format from Gemini API")
|
||||
|
||||
result = candidate["content"]["parts"][0]["text"]
|
||||
return result.strip()
|
||||
|
||||
except requests.RequestException as e:
|
||||
raise AIProcessingError(f"Gemini API request failed: {e}")
|
||||
except (KeyError, IndexError, ValueError) as e:
|
||||
raise AIProcessingError(f"Gemini API response error: {e}")
|
||||
|
||||
def _run(self, prompt: str, use_flash: bool = True, timeout: int = 300) -> str:
|
||||
"""Run Gemini with fallback between CLI and API"""
|
||||
# Try CLI first if available
|
||||
if self._cli_path:
|
||||
try:
|
||||
return self._run_cli(prompt, use_flash, timeout)
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Gemini CLI failed, trying API: {e}")
|
||||
|
||||
# Fallback to API
|
||||
if self._api_key:
|
||||
api_timeout = timeout if timeout < 180 else 180
|
||||
return self._call_api(prompt, use_flash, api_timeout)
|
||||
|
||||
raise AIProcessingError("No Gemini provider available (CLI or API)")
|
||||
|
||||
def summarize(self, text: str, **kwargs) -> str:
|
||||
"""Generate summary using Gemini"""
|
||||
prompt = f"""Summarize the following text:
|
||||
|
||||
{text}
|
||||
|
||||
Provide a clear, concise summary in Spanish."""
|
||||
return self._run(prompt, use_flash=True)
|
||||
|
||||
def correct_text(self, text: str, **kwargs) -> str:
|
||||
"""Correct text using Gemini"""
|
||||
prompt = f"""Correct the following text for grammar, spelling, and clarity:
|
||||
|
||||
{text}
|
||||
|
||||
Return only the corrected text, nothing else."""
|
||||
return self._run(prompt, use_flash=True)
|
||||
|
||||
def classify_content(self, text: str, **kwargs) -> Dict[str, Any]:
|
||||
"""Classify content using Gemini"""
|
||||
categories = ["historia", "analisis_contable", "instituciones_gobierno", "otras_clases"]
|
||||
|
||||
prompt = f"""Classify the following text into one of these categories:
|
||||
- historia
|
||||
- analisis_contable
|
||||
- instituciones_gobierno
|
||||
- otras_clases
|
||||
|
||||
Text: {text}
|
||||
|
||||
Return only the category name, nothing else."""
|
||||
result = self._run(prompt, use_flash=True).lower()
|
||||
|
||||
# Validate result
|
||||
if result not in categories:
|
||||
result = "otras_clases"
|
||||
|
||||
return {
|
||||
"category": result,
|
||||
"confidence": 0.9,
|
||||
"provider": self.name
|
||||
}
|
||||
137
backup/originals_20250109/processed_registry.py
Normal file
137
backup/originals_20250109/processed_registry.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
Processed files registry using repository pattern
|
||||
"""
|
||||
import fcntl
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Set, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from ..config import settings
|
||||
|
||||
|
||||
class ProcessedRegistry:
    """Registry for tracking processed files with caching and file locking.

    Entries are stored one path per line in a plain-text file at
    ``settings.processed_files_path``.  Both the full path and the bare
    basename are indexed in the in-memory cache so lookups succeed with
    either form.  Cross-process write safety relies on ``fcntl`` advisory
    locks (POSIX only).
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self._cache: Set[str] = set()                 # full paths and basenames
        self._cache_time: Optional[datetime] = None   # time of last disk read
        self._cache_ttl = 60                          # seconds before re-reading disk
        self._initialized = False

    def initialize(self) -> None:
        """Initialize the registry by loading it from disk."""
        self.load()
        self._initialized = True

    def load(self) -> Set[str]:
        """Load processed files from disk, honouring the cache TTL.

        Returns:
            A copy of the entry set (full paths plus basenames).
        """
        now = datetime.utcnow()
        if self._cache and self._cache_time and (now - self._cache_time).total_seconds() < self._cache_ttl:
            return self._cache.copy()

        processed = set()
        registry_path = settings.processed_files_path

        try:
            registry_path.parent.mkdir(parents=True, exist_ok=True)
            if registry_path.exists():
                with open(registry_path, 'r', encoding='utf-8') as f:
                    for raw_line in f:
                        line = raw_line.strip()
                        # '#'-prefixed lines act as comments in the registry file.
                        if line and not line.startswith('#'):
                            processed.add(line)
                            # Index by basename too so callers may pass either form.
                            processed.add(Path(line).name)
        except Exception as e:
            self.logger.error(f"Error reading processed files registry: {e}")

        self._cache = processed
        self._cache_time = now
        return processed.copy()

    def save(self, file_path: str) -> None:
        """Append *file_path* to the registry under an exclusive lock.

        Raises:
            Exception: re-raises any I/O failure after logging it.
        """
        if not file_path:
            return
        # Ensure the duplicate check below runs against the on-disk state;
        # a never-loaded cache would otherwise re-append existing entries.
        if not self._initialized:
            self.initialize()
        registry_path = settings.processed_files_path

        try:
            registry_path.parent.mkdir(parents=True, exist_ok=True)
            with open(registry_path, 'a', encoding='utf-8') as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
                try:
                    if file_path not in self._cache:
                        f.write(file_path + "\n")
                        self._cache.add(file_path)
                        self.logger.debug(f"Added {file_path} to processed registry")
                finally:
                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
        except Exception as e:
            self.logger.error(f"Error saving to processed files registry: {e}")
            raise

    def is_processed(self, file_path: str) -> bool:
        """Return True if *file_path* (full path or basename) was processed."""
        if not self._initialized:
            self.initialize()
        return file_path in self._cache or Path(file_path).name in self._cache

    def remove(self, file_path: str) -> bool:
        """Remove *file_path* (and its basename matches) from the registry.

        The file is opened read/write and locked *before* it is read or
        truncated.  The previous implementation truncated via
        ``open(..., 'w')`` before acquiring the lock and read the file with
        no lock at all, racing concurrent ``save()`` calls.

        Returns:
            True if the rewrite succeeded, False otherwise.
        """
        registry_path = settings.processed_files_path
        target_name = Path(file_path).name

        try:
            if not registry_path.exists():
                return False
            with open(registry_path, 'r+', encoding='utf-8') as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
                try:
                    lines_to_keep = [
                        line for line in f
                        if line.strip() != file_path and Path(line.strip()).name != target_name
                    ]
                    # Rewrite in place under the same lock.
                    f.seek(0)
                    f.truncate()
                    f.writelines(lines_to_keep)
                    self._cache.discard(file_path)
                    self._cache.discard(target_name)
                finally:
                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
            return True
        except Exception as e:
            self.logger.error(f"Error removing from processed files registry: {e}")
            return False

    def clear(self) -> None:
        """Delete the registry file and reset the in-memory cache.

        Raises:
            Exception: re-raises any I/O failure after logging it.
        """
        registry_path = settings.processed_files_path
        try:
            if registry_path.exists():
                registry_path.unlink()
            self._cache.clear()
            self._cache_time = None
            self.logger.info("Processed files registry cleared")
        except Exception as e:
            self.logger.error(f"Error clearing processed files registry: {e}")
            raise

    def get_all(self) -> Set[str]:
        """Return a copy of all known entries (paths and basenames)."""
        if not self._initialized:
            self.initialize()
        return self._cache.copy()

    def count(self) -> int:
        """Return the number of cached entries (paths + basenames combined)."""
        if not self._initialized:
            self.initialize()
        return len(self._cache)


# Global instance
processed_registry = ProcessedRegistry()
|
||||
8
config/__init__.py
Normal file
8
config/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""
|
||||
Configuration package for CBCFacil
|
||||
"""
|
||||
|
||||
from .settings import settings
|
||||
from .validators import validate_environment
|
||||
|
||||
__all__ = ['settings', 'validate_environment']
|
||||
211
config/settings.py
Normal file
211
config/settings.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
Centralized configuration management for CBCFacil
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional, Set, Union
|
||||
|
||||
|
||||
class ConfigurationError(Exception):
    """Raised when configuration is invalid"""
    # NOTE(review): this class is also defined in config/validators.py and
    # core/exceptions.py; code catching one of them will NOT catch the
    # others — consider consolidating on core.exceptions.ConfigurationError.
    pass
|
||||
|
||||
|
||||
class Settings:
    """Application settings loaded from environment variables.

    All class attributes are evaluated once, at import time; later changes
    to the process environment are not picked up.  Attribute order matters:
    several defaults reference attributes defined above them
    (e.g. WEBDAV_ENDPOINT, PDF_TROCR_MAX_BATCH, PDF_PREPROCESS_THREADS).
    """

    # Application
    APP_NAME: str = "CBCFacil"
    APP_VERSION: str = "8.0"
    DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"

    # Nextcloud/WebDAV Configuration
    NEXTCLOUD_URL: str = os.getenv("NEXTCLOUD_URL", "")
    NEXTCLOUD_USER: str = os.getenv("NEXTCLOUD_USER", "")
    NEXTCLOUD_PASSWORD: str = os.getenv("NEXTCLOUD_PASSWORD", "")
    # Snapshot alias of NEXTCLOUD_URL taken at class-creation time.
    WEBDAV_ENDPOINT: str = NEXTCLOUD_URL

    # Remote folders (paths relative to the WebDAV root)
    REMOTE_AUDIOS_FOLDER: str = "Audios"
    REMOTE_DOCX_AUDIO_FOLDER: str = "Documentos"
    REMOTE_PDF_FOLDER: str = "Pdf"
    REMOTE_TXT_FOLDER: str = "Textos"
    RESUMENES_FOLDER: str = "Resumenes"
    DOCX_FOLDER: str = "Documentos"

    # Local paths (BASE_DIR is the project root, one level above config/)
    BASE_DIR: Path = Path(__file__).resolve().parent.parent
    LOCAL_STATE_DIR: str = os.getenv("LOCAL_STATE_DIR", str(BASE_DIR))
    LOCAL_DOWNLOADS_PATH: Path = BASE_DIR / "downloads"
    LOCAL_RESUMENES: Path = LOCAL_DOWNLOADS_PATH
    LOCAL_DOCX: Path = BASE_DIR / "resumenes_docx"

    # Processing
    POLL_INTERVAL: int = int(os.getenv("POLL_INTERVAL", "5"))
    HTTP_TIMEOUT: int = int(os.getenv("HTTP_TIMEOUT", "30"))
    WEBDAV_MAX_RETRIES: int = int(os.getenv("WEBDAV_MAX_RETRIES", "3"))
    DOWNLOAD_CHUNK_SIZE: int = int(os.getenv("DOWNLOAD_CHUNK_SIZE", "65536"))  # 64KB for better performance
    MAX_FILENAME_LENGTH: int = int(os.getenv("MAX_FILENAME_LENGTH", "80"))
    MAX_FILENAME_BASE_LENGTH: int = int(os.getenv("MAX_FILENAME_BASE_LENGTH", "40"))
    MAX_FILENAME_TOPICS_LENGTH: int = int(os.getenv("MAX_FILENAME_TOPICS_LENGTH", "20"))

    # File extensions
    AUDIO_EXTENSIONS: Set[str] = {".mp3", ".wav", ".m4a", ".ogg", ".aac"}
    PDF_EXTENSIONS: Set[str] = {".pdf"}
    TXT_EXTENSIONS: Set[str] = {".txt"}

    # AI Providers
    ZAI_BASE_URL: str = os.getenv("ZAI_BASE_URL", "https://api.z.ai/api/anthropic")
    ZAI_DEFAULT_MODEL: str = os.getenv("ZAI_MODEL", "glm-4.6")
    # NOTE(review): annotated Optional[str] but the fallback default is "",
    # so this is never None — truthiness checks still behave as intended.
    ZAI_AUTH_TOKEN: Optional[str] = os.getenv("ANTHROPIC_AUTH_TOKEN") or os.getenv("ZAI_AUTH_TOKEN", "")

    # Gemini
    GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
    GEMINI_FLASH_MODEL: Optional[str] = os.getenv("GEMINI_FLASH_MODEL")
    GEMINI_PRO_MODEL: Optional[str] = os.getenv("GEMINI_PRO_MODEL")

    # CLI paths
    GEMINI_CLI_PATH: Optional[str] = os.getenv("GEMINI_CLI_PATH")
    CLAUDE_CLI_PATH: Optional[str] = os.getenv("CLAUDE_CLI_PATH")

    # Telegram
    TELEGRAM_TOKEN: Optional[str] = os.getenv("TELEGRAM_TOKEN")
    TELEGRAM_CHAT_ID: Optional[str] = os.getenv("TELEGRAM_CHAT_ID")

    # PDF Processing Configuration
    CPU_COUNT: int = os.cpu_count() or 1
    PDF_MAX_PAGES_PER_CHUNK: int = int(os.getenv("PDF_MAX_PAGES_PER_CHUNK", "2"))
    PDF_DPI: int = int(os.getenv("PDF_DPI", "200"))
    PDF_RENDER_THREAD_COUNT: int = int(os.getenv("PDF_RENDER_THREAD_COUNT", str(min(4, CPU_COUNT))))
    PDF_BATCH_SIZE: int = int(os.getenv("PDF_BATCH_SIZE", "2"))
    PDF_TROCR_MAX_BATCH: int = int(os.getenv("PDF_TROCR_MAX_BATCH", str(PDF_BATCH_SIZE)))
    # Default: at most 2 tesseract threads, scaled down on small machines.
    PDF_TESSERACT_THREADS: int = int(os.getenv("PDF_TESSERACT_THREADS", str(max(1, min(2, max(1, CPU_COUNT // 3))))))
    PDF_PREPROCESS_THREADS: int = int(os.getenv("PDF_PREPROCESS_THREADS", str(PDF_TESSERACT_THREADS)))
    PDF_TEXT_DETECTION_MIN_RATIO: float = float(os.getenv("PDF_TEXT_DETECTION_MIN_RATIO", "0.6"))
    PDF_TEXT_DETECTION_MIN_AVG_CHARS: int = int(os.getenv("PDF_TEXT_DETECTION_MIN_AVG_CHARS", "120"))

    # Error handling
    ERROR_THROTTLE_SECONDS: int = int(os.getenv("ERROR_THROTTLE_SECONDS", "600"))

    # GPU/VRAM Management
    MODEL_TIMEOUT_SECONDS: int = int(os.getenv("MODEL_TIMEOUT_SECONDS", "300"))
    # NOTE(review): "all" is not a value CUDA itself recognizes for
    # CUDA_VISIBLE_DEVICES (it expects device indices/UUIDs) — confirm how
    # consumers of this setting interpret it.
    CUDA_VISIBLE_DEVICES: str = os.getenv("CUDA_VISIBLE_DEVICES", "all")
    PYTORCH_CUDA_ALLOC_CONF: str = os.getenv("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:512")

    # GPU Detection (auto, nvidia, amd, cpu)
    GPU_PREFERENCE: str = os.getenv("GPU_PREFERENCE", "auto")
    # AMD ROCm HSA override for RX 6000 series (gfx1030)
    HSA_OVERRIDE_GFX_VERSION: str = os.getenv("HSA_OVERRIDE_GFX_VERSION", "10.3.0")

    # Dashboard
    DASHBOARD_SECRET_KEY: str = os.getenv("DASHBOARD_SECRET_KEY", "")
    DASHBOARD_PORT: int = int(os.getenv("DASHBOARD_PORT", "5000"))
    DASHBOARD_HOST: str = os.getenv("DASHBOARD_HOST", "0.0.0.0")

    # Logging
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
    LOG_FILE: Optional[str] = os.getenv("LOG_FILE")

    # Threading optimization
    OMP_NUM_THREADS: int = int(os.getenv("OMP_NUM_THREADS", "4"))
    MKL_NUM_THREADS: int = int(os.getenv("MKL_NUM_THREADS", "4"))

    # ========================================================================
    # PROPERTIES WITH VALIDATION
    # ========================================================================

    @property
    def is_production(self) -> bool:
        """Check if running in production mode (production = not DEBUG)."""
        return not self.DEBUG

    @property
    def has_webdav_config(self) -> bool:
        """Check if all three WebDAV credentials are non-empty."""
        return all([self.NEXTCLOUD_URL, self.NEXTCLOUD_USER, self.NEXTCLOUD_PASSWORD])

    @property
    def has_ai_config(self) -> bool:
        """Check if at least one AI provider (API or CLI) is configured."""
        return any([
            self.ZAI_AUTH_TOKEN,
            self.GEMINI_API_KEY,
            self.CLAUDE_CLI_PATH,
            self.GEMINI_CLI_PATH
        ])

    @property
    def processed_files_path(self) -> Path:
        """Get the path to the processed files registry.

        Re-reads PROCESSED_FILES_PATH from the environment on every access,
        unlike the class attributes above.
        """
        return Path(os.getenv("PROCESSED_FILES_PATH", str(Path(self.LOCAL_STATE_DIR) / "processed_files.txt")))

    @property
    def nextcloud_url(self) -> str:
        """Get Nextcloud URL; raises ConfigurationError if unset in production."""
        if not self.NEXTCLOUD_URL and self.is_production:
            raise ConfigurationError("NEXTCLOUD_URL is required in production mode")
        return self.NEXTCLOUD_URL

    @property
    def nextcloud_user(self) -> str:
        """Get Nextcloud username; raises ConfigurationError if unset in production."""
        if not self.NEXTCLOUD_USER and self.is_production:
            raise ConfigurationError("NEXTCLOUD_USER is required in production mode")
        return self.NEXTCLOUD_USER

    @property
    def nextcloud_password(self) -> str:
        """Get Nextcloud password; raises ConfigurationError if unset in production."""
        if not self.NEXTCLOUD_PASSWORD and self.is_production:
            raise ConfigurationError("NEXTCLOUD_PASSWORD is required in production mode")
        return self.NEXTCLOUD_PASSWORD

    @property
    def valid_webdav_config(self) -> bool:
        """Validate WebDAV configuration completeness via the raising properties."""
        try:
            _ = self.nextcloud_url
            _ = self.nextcloud_user
            _ = self.nextcloud_password
            return True
        except ConfigurationError:
            return False

    @property
    def telegram_configured(self) -> bool:
        """Check if both Telegram token and chat id are set."""
        return bool(self.TELEGRAM_TOKEN and self.TELEGRAM_CHAT_ID)

    @property
    def has_gpu_support(self) -> bool:
        """Check CUDA availability via torch; False if torch is not installed."""
        try:
            import torch
            return torch.cuda.is_available()
        except ImportError:
            return False

    @property
    def environment_type(self) -> str:
        """Get environment type as string ("production" or "development")."""
        return "production" if self.is_production else "development"

    @property
    def config_summary(self) -> dict:
        """Get a non-secret configuration summary suitable for logging."""
        return {
            "app_name": self.APP_NAME,
            "version": self.APP_VERSION,
            "environment": self.environment_type,
            "debug": self.DEBUG,
            "webdav_configured": self.has_webdav_config,
            "ai_configured": self.has_ai_config,
            "telegram_configured": self.telegram_configured,
            "gpu_support": self.has_gpu_support,
            "cpu_count": self.CPU_COUNT,
            "poll_interval": self.POLL_INTERVAL
        }


# Create global settings instance
settings = Settings()
|
||||
130
config/settings.py.backup
Normal file
130
config/settings.py.backup
Normal file
@@ -0,0 +1,130 @@
|
||||
"""
|
||||
Centralized configuration management for CBCFacil
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional, Set
|
||||
|
||||
|
||||
# NOTE(review): this is the pre-refactor backup copy (settings.py.backup) of
# config/settings.py — it lacks the GPU-preference, ROCm and validation
# properties of the current version and should not be imported by live code.
class Settings:
    """Application settings loaded from environment variables.

    All attributes are evaluated once at import time; later environment
    changes are not picked up.
    """

    # Application
    APP_NAME: str = "CBCFacil"
    APP_VERSION: str = "8.0"
    DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"

    # Nextcloud/WebDAV Configuration
    NEXTCLOUD_URL: str = os.getenv("NEXTCLOUD_URL", "")
    NEXTCLOUD_USER: str = os.getenv("NEXTCLOUD_USER", "")
    NEXTCLOUD_PASSWORD: str = os.getenv("NEXTCLOUD_PASSWORD", "")
    # Snapshot alias of NEXTCLOUD_URL taken at class-creation time.
    WEBDAV_ENDPOINT: str = NEXTCLOUD_URL

    # Remote folders
    REMOTE_AUDIOS_FOLDER: str = "Audios"
    REMOTE_DOCX_AUDIO_FOLDER: str = "Documentos"
    REMOTE_PDF_FOLDER: str = "Pdf"
    REMOTE_TXT_FOLDER: str = "Textos"
    RESUMENES_FOLDER: str = "Resumenes"
    DOCX_FOLDER: str = "Documentos"

    # Local paths
    BASE_DIR: Path = Path(__file__).resolve().parent.parent
    LOCAL_STATE_DIR: str = os.getenv("LOCAL_STATE_DIR", str(BASE_DIR))
    LOCAL_DOWNLOADS_PATH: Path = BASE_DIR / "downloads"
    LOCAL_RESUMENES: Path = LOCAL_DOWNLOADS_PATH
    LOCAL_DOCX: Path = BASE_DIR / "resumenes_docx"

    # Processing
    POLL_INTERVAL: int = int(os.getenv("POLL_INTERVAL", "5"))
    HTTP_TIMEOUT: int = int(os.getenv("HTTP_TIMEOUT", "30"))
    WEBDAV_MAX_RETRIES: int = int(os.getenv("WEBDAV_MAX_RETRIES", "3"))
    DOWNLOAD_CHUNK_SIZE: int = int(os.getenv("DOWNLOAD_CHUNK_SIZE", "65536"))  # 64KB for better performance
    MAX_FILENAME_LENGTH: int = int(os.getenv("MAX_FILENAME_LENGTH", "80"))
    MAX_FILENAME_BASE_LENGTH: int = int(os.getenv("MAX_FILENAME_BASE_LENGTH", "40"))
    MAX_FILENAME_TOPICS_LENGTH: int = int(os.getenv("MAX_FILENAME_TOPICS_LENGTH", "20"))

    # File extensions
    AUDIO_EXTENSIONS: Set[str] = {".mp3", ".wav", ".m4a", ".ogg", ".aac"}
    PDF_EXTENSIONS: Set[str] = {".pdf"}
    TXT_EXTENSIONS: Set[str] = {".txt"}

    # AI Providers
    ZAI_BASE_URL: str = os.getenv("ZAI_BASE_URL", "https://api.z.ai/api/anthropic")
    ZAI_DEFAULT_MODEL: str = os.getenv("ZAI_MODEL", "glm-4.6")
    ZAI_AUTH_TOKEN: Optional[str] = os.getenv("ANTHROPIC_AUTH_TOKEN") or os.getenv("ZAI_AUTH_TOKEN", "")

    # Gemini
    GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
    GEMINI_FLASH_MODEL: Optional[str] = os.getenv("GEMINI_FLASH_MODEL")
    GEMINI_PRO_MODEL: Optional[str] = os.getenv("GEMINI_PRO_MODEL")

    # CLI paths
    GEMINI_CLI_PATH: Optional[str] = os.getenv("GEMINI_CLI_PATH")
    CLAUDE_CLI_PATH: Optional[str] = os.getenv("CLAUDE_CLI_PATH")

    # Telegram
    TELEGRAM_TOKEN: Optional[str] = os.getenv("TELEGRAM_TOKEN")
    TELEGRAM_CHAT_ID: Optional[str] = os.getenv("TELEGRAM_CHAT_ID")

    # PDF Processing Configuration
    CPU_COUNT: int = os.cpu_count() or 1
    PDF_MAX_PAGES_PER_CHUNK: int = int(os.getenv("PDF_MAX_PAGES_PER_CHUNK", "2"))
    PDF_DPI: int = int(os.getenv("PDF_DPI", "200"))
    PDF_RENDER_THREAD_COUNT: int = int(os.getenv("PDF_RENDER_THREAD_COUNT", str(min(4, CPU_COUNT))))
    PDF_BATCH_SIZE: int = int(os.getenv("PDF_BATCH_SIZE", "2"))
    PDF_TROCR_MAX_BATCH: int = int(os.getenv("PDF_TROCR_MAX_BATCH", str(PDF_BATCH_SIZE)))
    PDF_TESSERACT_THREADS: int = int(os.getenv("PDF_TESSERACT_THREADS", str(max(1, min(2, max(1, CPU_COUNT // 3))))))
    PDF_PREPROCESS_THREADS: int = int(os.getenv("PDF_PREPROCESS_THREADS", str(PDF_TESSERACT_THREADS)))
    PDF_TEXT_DETECTION_MIN_RATIO: float = float(os.getenv("PDF_TEXT_DETECTION_MIN_RATIO", "0.6"))
    PDF_TEXT_DETECTION_MIN_AVG_CHARS: int = int(os.getenv("PDF_TEXT_DETECTION_MIN_AVG_CHARS", "120"))

    # Error handling
    ERROR_THROTTLE_SECONDS: int = int(os.getenv("ERROR_THROTTLE_SECONDS", "600"))

    # GPU/VRAM Management
    MODEL_TIMEOUT_SECONDS: int = int(os.getenv("MODEL_TIMEOUT_SECONDS", "300"))
    CUDA_VISIBLE_DEVICES: str = os.getenv("CUDA_VISIBLE_DEVICES", "all")
    PYTORCH_CUDA_ALLOC_CONF: str = os.getenv("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:512")

    # Dashboard
    DASHBOARD_SECRET_KEY: str = os.getenv("DASHBOARD_SECRET_KEY", "")
    DASHBOARD_PORT: int = int(os.getenv("DASHBOARD_PORT", "5000"))
    DASHBOARD_HOST: str = os.getenv("DASHBOARD_HOST", "0.0.0.0")

    # Logging
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
    LOG_FILE: Optional[str] = os.getenv("LOG_FILE")

    # Threading optimization
    OMP_NUM_THREADS: int = int(os.getenv("OMP_NUM_THREADS", "4"))
    MKL_NUM_THREADS: int = int(os.getenv("MKL_NUM_THREADS", "4"))

    @property
    def is_production(self) -> bool:
        """Check if running in production mode (production = not DEBUG)."""
        return not self.DEBUG

    @property
    def has_webdav_config(self) -> bool:
        """Check if all three WebDAV credentials are non-empty."""
        return all([self.NEXTCLOUD_URL, self.NEXTCLOUD_USER, self.NEXTCLOUD_PASSWORD])

    @property
    def has_ai_config(self) -> bool:
        """Check if at least one AI provider (API or CLI) is configured."""
        return any([
            self.ZAI_AUTH_TOKEN,
            self.GEMINI_API_KEY,
            self.CLAUDE_CLI_PATH,
            self.GEMINI_CLI_PATH
        ])

    @property
    def processed_files_path(self) -> Path:
        """Get the path to the processed files registry (env re-read per access)."""
        return Path(os.getenv("PROCESSED_FILES_PATH", str(Path(self.LOCAL_STATE_DIR) / "processed_files.txt")))


# Create global settings instance
settings = Settings()
|
||||
64
config/validators.py
Normal file
64
config/validators.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""
|
||||
Configuration validators for CBCFacil
|
||||
"""
|
||||
import logging
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
class ConfigurationError(Exception):
    """Raised when configuration is invalid"""
    # NOTE(review): duplicates the ConfigurationError classes in
    # config/settings.py and core/exceptions.py — an `except` clause for one
    # will not catch the others.
    pass
|
||||
|
||||
|
||||
def validate_environment() -> List[str]:
    """Validate environment variables and configuration at startup.

    Non-fatal issues are logged and returned as warnings.  Hard
    configuration errors (obviously malformed credentials) are NOT
    returned — they are collected and raised as a single
    ``ConfigurationError`` (the previous docstring incorrectly claimed
    errors were returned in the list).

    Returns:
        List of warning strings (possibly empty).

    Raises:
        ConfigurationError: if any critical configuration error is found.
    """
    # Imported lazily to avoid a circular import at package load time.
    from .settings import settings

    warnings = []
    errors = []

    # Check critical configurations
    if not settings.has_webdav_config:
        warnings.append("WebDAV credentials not configured - file sync will not work")

    if not settings.has_ai_config:
        warnings.append("No AI providers configured - summary generation will not work")

    # Validate API keys format if provided.  Length is only a cheap sanity
    # check: it cannot prove a key is valid, just catch obvious paste errors.
    if settings.ZAI_AUTH_TOKEN and len(settings.ZAI_AUTH_TOKEN) < 10:
        errors.append("ZAI_AUTH_TOKEN appears to be invalid (too short)")

    if settings.GEMINI_API_KEY and len(settings.GEMINI_API_KEY) < 20:
        errors.append("GEMINI_API_KEY appears to be invalid (too short)")

    # Validate dashboard secret
    if not settings.DASHBOARD_SECRET_KEY:
        warnings.append("DASHBOARD_SECRET_KEY not set - using default is not recommended for production")

    if settings.DASHBOARD_SECRET_KEY == "dashboard-secret-key-change-in-production":
        warnings.append("Using default dashboard secret key - please change in production")

    # Check CUDA availability (torch is an optional dependency)
    try:
        import torch
        if not torch.cuda.is_available():
            warnings.append("CUDA not available - GPU acceleration will be disabled")
    except ImportError:
        warnings.append("PyTorch not installed - GPU acceleration will be disabled")

    # Surface warnings through logging so they show up at startup
    for warning in warnings:
        logging.warning(f"Configuration warning: {warning}")

    # Abort on critical issues
    if errors:
        error_msg = "Configuration errors:\n" + "\n".join(f"- {e}" for e in errors)
        logging.error(error_msg)
        raise ConfigurationError(error_msg)

    return warnings
|
||||
21
core/__init__.py
Normal file
21
core/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""
|
||||
Core package for CBCFacil
|
||||
"""
|
||||
|
||||
from .exceptions import (
|
||||
ProcessingError,
|
||||
WebDAVError,
|
||||
AIProcessingError,
|
||||
ConfigurationError,
|
||||
FileProcessingError
|
||||
)
|
||||
from .result import Result
|
||||
|
||||
__all__ = [
|
||||
'ProcessingError',
|
||||
'WebDAVError',
|
||||
'AIProcessingError',
|
||||
'ConfigurationError',
|
||||
'FileProcessingError',
|
||||
'Result'
|
||||
]
|
||||
35
core/base_service.py
Normal file
35
core/base_service.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""
|
||||
Base service class for CBCFacil services
|
||||
"""
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class BaseService(ABC):
    """Abstract base for all CBCFacil services.

    Subclasses must implement :meth:`initialize` and :meth:`cleanup`.
    Instances double as context managers, guaranteeing that cleanup runs
    when the ``with`` block exits.
    """

    def __init__(self, name: str):
        self.name = name
        # Per-service child logger, namespaced under this module.
        self.logger = logging.getLogger(f"{__name__}.{name}")

    @abstractmethod
    def initialize(self) -> None:
        """Initialize the service"""

    @abstractmethod
    def cleanup(self) -> None:
        """Cleanup service resources"""

    def health_check(self) -> bool:
        """Report service health; the default implementation is always healthy."""
        return True

    def __enter__(self):
        self.initialize()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()
|
||||
38
core/exceptions.py
Normal file
38
core/exceptions.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""
|
||||
Custom exceptions for CBCFacil
|
||||
"""
|
||||
|
||||
|
||||
class ProcessingError(Exception):
    """Base exception for all processing errors"""


class ConfigurationError(ProcessingError):
    """Raised when configuration is invalid"""


class WebDAVError(ProcessingError):
    """Raised when WebDAV operations fail"""


class AIProcessingError(ProcessingError):
    """Raised when AI processing fails"""


class FileProcessingError(ProcessingError):
    """Raised when file processing fails"""


class AuthenticationError(ProcessingError):
    """Raised when authentication fails"""


class ValidationError(ProcessingError):
    """Raised when input validation fails"""
|
||||
355
core/health_check.py
Normal file
355
core/health_check.py
Normal file
@@ -0,0 +1,355 @@
|
||||
"""
|
||||
Health check endpoint for CBCFacil service monitoring
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HealthChecker:
|
||||
"""Comprehensive health check for all service dependencies"""
|
||||
|
||||
    def __init__(self):
        # Dedicated logger for health-check diagnostics.
        self.logger = logging.getLogger(__name__)
|
||||
|
||||
    def check_webdav_connection(self) -> Dict[str, Any]:
        """Check WebDAV service connectivity.

        Returns a status dict with ``service``/``status``/``timestamp``,
        plus ``message`` and ``endpoint`` on success or ``error`` on
        failure.  Imports are deliberately local so a missing optional
        dependency yields an "unhealthy" result rather than breaking
        module import.
        """
        from config import settings

        result = {
            "service": "webdav",
            "status": "unknown",
            "timestamp": datetime.utcnow().isoformat()
        }

        try:
            from services.webdav_service import webdav_service

            # Missing credentials is "not configured", not a failure.
            if not settings.has_webdav_config:
                result["status"] = "not_configured"
                result["message"] = "WebDAV credentials not configured"
                return result

            # Test connection with a simple list operation
            webdav_service.list(".")

            result["status"] = "healthy"
            result["message"] = "WebDAV connection successful"
            result["endpoint"] = settings.NEXTCLOUD_URL

        except Exception as e:
            result["status"] = "unhealthy"
            result["error"] = str(e)
            self.logger.error(f"WebDAV health check failed: {e}")

        return result
|
||||
|
||||
    def check_ai_providers(self) -> Dict[str, Any]:
        """Check AI provider configurations.

        Inspects configuration only (no network calls), reporting each
        provider under ``providers``: API providers get status
        "unknown"/"not_configured"; CLI providers additionally get an
        on-disk path check ("available"/"path_invalid").
        """
        from config import settings

        result = {
            "service": "ai_providers",
            "status": "unknown",
            "timestamp": datetime.utcnow().isoformat(),
            "providers": {}
        }

        try:
            # Check ZAI — token presence only; validity is not probed here.
            if settings.ZAI_AUTH_TOKEN:
                result["providers"]["zai"] = {
                    "configured": True,
                    "status": "unknown"
                }
            else:
                result["providers"]["zai"] = {
                    "configured": False,
                    "status": "not_configured"
                }

            # Check Gemini
            if settings.GEMINI_API_KEY:
                result["providers"]["gemini"] = {
                    "configured": True,
                    "status": "unknown"
                }
            else:
                result["providers"]["gemini"] = {
                    "configured": False,
                    "status": "not_configured"
                }

            # Check CLI providers — here we can at least verify the binary path.
            if settings.CLAUDE_CLI_PATH:
                claude_path = Path(settings.CLAUDE_CLI_PATH)
                result["providers"]["claude_cli"] = {
                    "configured": True,
                    "path_exists": claude_path.exists(),
                    "status": "available" if claude_path.exists() else "path_invalid"
                }

            if settings.GEMINI_CLI_PATH:
                gemini_path = Path(settings.GEMINI_CLI_PATH)
                result["providers"]["gemini_cli"] = {
                    "configured": True,
                    "path_exists": gemini_path.exists(),
                    "status": "available" if gemini_path.exists() else "path_invalid"
                }

            # Overall status
            if settings.has_ai_config:
                result["status"] = "healthy"
                result["message"] = "At least one AI provider configured"
            else:
                result["status"] = "not_configured"
                result["message"] = "No AI providers configured"

        except Exception as e:
            result["status"] = "error"
            result["error"] = str(e)
            self.logger.error(f"AI providers health check failed: {e}")

        return result
|
||||
|
||||
    def check_vram_manager(self) -> Dict[str, Any]:
        """Check VRAM manager status.

        Converts the raw byte counts from ``vram_manager.get_vram_info()``
        to GiB.  Any failure (including an un-importable vram_manager)
        reports status "unavailable".
        """
        result = {
            "service": "vram_manager",
            "status": "unknown",
            "timestamp": datetime.utcnow().isoformat()
        }

        try:
            from services.vram_manager import vram_manager

            vram_info = vram_manager.get_vram_info()

            result["status"] = "healthy"
            # NOTE(review): assumes get_vram_info() reports bytes — confirm
            # against the vram_manager implementation.
            result["vram_info"] = {
                "total_gb": round(vram_info.get("total", 0) / (1024**3), 2),
                "free_gb": round(vram_info.get("free", 0) / (1024**3), 2),
                "allocated_gb": round(vram_info.get("allocated", 0) / (1024**3), 2)
            }
            result["cuda_available"] = vram_info.get("cuda_available", False)

        except Exception as e:
            result["status"] = "unavailable"
            result["error"] = str(e)
            self.logger.error(f"VRAM manager health check failed: {e}")

        return result
|
||||
|
||||
    def check_telegram_service(self) -> Dict[str, Any]:
        """Check Telegram service status.

        Only verifies that the service reports itself configured — no
        message is actually sent.
        """
        # NOTE(review): `settings` is imported but unused in this method.
        from config import settings

        result = {
            "service": "telegram",
            "status": "unknown",
            "timestamp": datetime.utcnow().isoformat()
        }

        try:
            from services.telegram_service import telegram_service

            if telegram_service.is_configured:
                result["status"] = "healthy"
                result["message"] = "Telegram service configured"
            else:
                result["status"] = "not_configured"
                result["message"] = "Telegram credentials not configured"

        except Exception as e:
            result["status"] = "error"
            result["error"] = str(e)
            self.logger.error(f"Telegram service health check failed: {e}")

        return result
|
||||
|
||||
def check_processed_registry(self) -> Dict[str, Any]:
|
||||
"""Check processed files registry"""
|
||||
result = {
|
||||
"service": "processed_registry",
|
||||
"status": "unknown",
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
try:
|
||||
from storage.processed_registry import processed_registry
|
||||
|
||||
# Try to load registry
|
||||
processed_registry.load()
|
||||
|
||||
result["status"] = "healthy"
|
||||
result["registry_path"] = str(processed_registry.registry_path)
|
||||
|
||||
# Check if registry file is writable
|
||||
registry_file = Path(processed_registry.registry_path)
|
||||
if registry_file.exists():
|
||||
result["registry_exists"] = True
|
||||
result["registry_writable"] = registry_file.is_file() and os.access(registry_file, os.W_OK)
|
||||
else:
|
||||
result["registry_exists"] = False
|
||||
|
||||
except Exception as e:
|
||||
result["status"] = "unhealthy"
|
||||
result["error"] = str(e)
|
||||
self.logger.error(f"Processed registry health check failed: {e}")
|
||||
|
||||
return result
|
||||
|
||||
def check_disk_space(self) -> Dict[str, Any]:
|
||||
"""Check available disk space"""
|
||||
result = {
|
||||
"service": "disk_space",
|
||||
"status": "unknown",
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
try:
|
||||
import shutil
|
||||
|
||||
# Check main directory
|
||||
usage = shutil.disk_usage(Path(__file__).parent.parent)
|
||||
|
||||
total_gb = usage.total / (1024**3)
|
||||
free_gb = usage.free / (1024**3)
|
||||
used_percent = (usage.used / usage.total) * 100
|
||||
|
||||
result["status"] = "healthy"
|
||||
result["total_gb"] = round(total_gb, 2)
|
||||
result["free_gb"] = round(free_gb, 2)
|
||||
result["used_percent"] = round(used_percent, 2)
|
||||
|
||||
# Warning if low disk space
|
||||
if free_gb < 1: # Less than 1GB
|
||||
result["status"] = "warning"
|
||||
result["message"] = "Low disk space"
|
||||
elif free_gb < 5: # Less than 5GB
|
||||
result["status"] = "degraded"
|
||||
result["message"] = "Disk space running low"
|
||||
|
||||
except Exception as e:
|
||||
result["status"] = "error"
|
||||
result["error"] = str(e)
|
||||
self.logger.error(f"Disk space health check failed: {e}")
|
||||
|
||||
return result
|
||||
|
||||
def check_configuration(self) -> Dict[str, Any]:
    """Validate application configuration and collect soft warnings.

    Returns a dict with ``service``/``status``/``timestamp`` keys.
    Status is "healthy" when no warnings were found, "warning" when
    the configuration is usable but incomplete, and "error" when the
    settings could not be inspected at all.
    """
    result: Dict[str, Any] = {
        "service": "configuration",
        "status": "unknown",
        "timestamp": datetime.utcnow().isoformat()
    }

    try:
        # Fix: import inside the try block so a missing or broken config
        # module yields an "error" result instead of propagating out of
        # the health check (the original imported before the try).
        from config import settings

        warnings = []

        if not settings.has_webdav_config:
            warnings.append("WebDAV not configured")

        if not settings.has_ai_config:
            warnings.append("AI providers not configured")

        if not settings.telegram_configured:
            warnings.append("Telegram not configured")

        if settings.DASHBOARD_SECRET_KEY == "":
            warnings.append("Dashboard secret key not set")

        if settings.DASHBOARD_SECRET_KEY == "dashboard-secret-key-change-in-production":
            warnings.append("Using default dashboard secret")

        result["status"] = "healthy" if not warnings else "warning"
        result["warnings"] = warnings
        result["environment"] = settings.environment_type

    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)
        self.logger.error(f"Configuration health check failed: {e}")

    return result
|
||||
|
||||
def run_full_health_check(self) -> Dict[str, Any]:
    """Run every registered health check and aggregate the results.

    Returns a dict with:
      - ``overall_status``: "healthy" | "warning" | "unhealthy"
      - ``timestamp``: UTC ISO timestamp of the aggregation
      - ``checks``: per-check result dicts keyed by check name
      - ``summary``: counts of healthy / warning / unhealthy checks

    A check that raises is recorded as an "error" result and makes the
    overall status "unhealthy". Fix: the summary now groups "degraded"
    with warnings and "error" with unhealthy — the original summary
    silently ignored both statuses, under-reporting problems even
    though overall_status did account for them.
    """
    checks = [
        ("configuration", self.check_configuration),
        ("webdav", self.check_webdav_connection),
        ("ai_providers", self.check_ai_providers),
        ("vram_manager", self.check_vram_manager),
        ("telegram", self.check_telegram_service),
        ("processed_registry", self.check_processed_registry),
        ("disk_space", self.check_disk_space)
    ]

    results = {}
    overall_status = "healthy"

    for check_name, check_func in checks:
        try:
            result = check_func()
            results[check_name] = result

            # Escalate overall status; "unhealthy" is sticky and a
            # warning never downgrades an existing unhealthy state.
            if result["status"] in ("unhealthy", "error"):
                overall_status = "unhealthy"
            elif result["status"] in ("warning", "degraded") and overall_status == "healthy":
                overall_status = "warning"

        except Exception as e:
            # A crashing check is itself a health failure.
            results[check_name] = {
                "service": check_name,
                "status": "error",
                "error": str(e),
                "timestamp": datetime.utcnow().isoformat()
            }
            overall_status = "unhealthy"
            self.logger.error(f"Health check {check_name} failed: {e}")

    statuses = [r["status"] for r in results.values()]
    return {
        "overall_status": overall_status,
        "timestamp": datetime.utcnow().isoformat(),
        "checks": results,
        "summary": {
            "total_checks": len(checks),
            "healthy": statuses.count("healthy"),
            # "degraded" is a soft problem, grouped with warnings.
            "warning": sum(1 for s in statuses if s in ("warning", "degraded")),
            # A crashed check ("error") counts as unhealthy.
            "unhealthy": sum(1 for s in statuses if s in ("unhealthy", "error"))
        }
    }
|
||||
|
||||
|
||||
# Convenience function for CLI usage
|
||||
def get_health_status() -> Dict[str, Any]:
    """Convenience wrapper: build a HealthChecker and run all checks."""
    return HealthChecker().run_full_health_check()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI usage: python core/health_check.py
    # Exit codes: 0 = healthy, 1 = warning, 2 = unhealthy/error.
    # Fix: dropped the redundant local `import os` (os is already used
    # at module level and was unused in this block).
    import sys

    health = get_health_status()
    print(json.dumps(health, indent=2))

    exit_codes = {"healthy": 0, "warning": 1}
    sys.exit(exit_codes.get(health["overall_status"], 2))
|
||||
43
core/result.py
Normal file
43
core/result.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""
|
||||
Result type for handling success/error cases
|
||||
"""
|
||||
from typing import TypeVar, Generic, Optional, Callable
|
||||
from dataclasses import dataclass
|
||||
|
||||
T = TypeVar('T')
|
||||
E = TypeVar('E')
|
||||
|
||||
|
||||
@dataclass
class Success(Generic[T]):
    """Successful outcome wrapping a value of type T.

    One half of the ``Result`` union (the other half is ``Error``).
    Use ``is_success()`` / ``is_error()`` to discriminate.
    """
    # The payload produced by the successful operation.
    value: T

    def is_success(self) -> bool:
        """Always True for a Success instance."""
        return True

    def is_error(self) -> bool:
        """Always False for a Success instance."""
        return False

    def map(self, func: Callable[[T], 'Success']) -> 'Success[T]':
        """Apply function to value.

        NOTE(review): despite the name and the declared return type,
        this returns ``func(self.value)`` *unwrapped* — i.e. bind /
        flat_map semantics. The annotation only holds if ``func`` itself
        returns a Success. Confirm whether wrapping was intended.
        """
        return func(self.value)
|
||||
|
||||
|
||||
@dataclass
class Error(Generic[E]):
    """Failed outcome wrapping an error value of type E.

    One half of the ``Result`` union (the other half is ``Success``).
    ``map`` is a no-op so transformations can be chained safely.
    """
    # The error payload describing the failure.
    error: E

    def is_success(self) -> bool:
        """Always False for an Error instance."""
        return False

    def is_error(self) -> bool:
        """Always True for an Error instance."""
        return True

    def map(self, func: Callable) -> 'Error[E]':
        """Ignore ``func`` and return self, propagating the error."""
        return self
|
||||
|
||||
|
||||
# Type alias: a computation yields either a Success[T] or an Error[E].
# NOTE: the `|` union on subscripted generics requires Python 3.10+.
Result = Success[T] | Error[E]
|
||||
417
dashboard.py
417
dashboard.py
@@ -1,417 +0,0 @@
|
||||
#!/usr/bin/env python3
"""
Flask dashboard for audio-file management.
Simple web interface to reprocess MP3 files with one click.
"""

import os
import logging
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any

from flask import Flask, render_template, request, jsonify, send_from_directory
from flask_cors import CORS

# Make the project root importable without hard-coded absolute paths,
# then pull shared configuration and helpers from main.py.
PROJECT_ROOT = Path(__file__).resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
from main import (
    AUDIO_EXTENSIONS, LOCAL_DOWNLOADS_PATH, PROCESSED_FILES_PATH,
    load_processed_files, save_processed_file, process_audio_file,
    REMOTE_AUDIOS_FOLDER, webdav_list, normalize_remote_path
)

app = Flask(__name__)
CORS(app)

# Flask configuration; the fallback secret key must be overridden in
# production via the DASHBOARD_SECRET_KEY environment variable.
app.config['SECRET_KEY'] = os.getenv('DASHBOARD_SECRET_KEY', 'dashboard-secret-key-change-in-production')
app.config['DOWNLOADS_FOLDER'] = LOCAL_DOWNLOADS_PATH

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
||||
|
||||
class FileManager:
    """File manager backing the dashboard.

    Aggregates audio files from WebDAV and the local downloads folder,
    tracks which have been processed, and exposes the reprocessing
    helpers used by the Flask routes.
    """

    def __init__(self):
        # Set of processed-file identifiers as stored in the registry
        # (may contain full paths and/or bare basenames).
        self.processed_files = set()
        self.load_processed_files()

    def load_processed_files(self):
        """(Re)load the processed-file registry from disk.

        Falls back to an empty set on failure so the dashboard keeps
        working (files will simply show as unprocessed).
        """
        try:
            self.processed_files = load_processed_files()
            logger.info(f"Cargados {len(self.processed_files)} archivos procesados")
        except Exception as e:
            logger.error(f"Error cargando archivos procesados: {e}")
            self.processed_files = set()

    def get_audio_files(self) -> List[Dict[str, Any]]:
        """Return the merged, deduplicated list of known audio files.

        Combines the WebDAV and local sources; each entry carries
        filename, path, source, processed flag, size, last_modified and
        a per-format availability map. WebDAV entries win on duplicates.
        """
        files = []

        # --- WebDAV source ------------------------------------------------
        try:
            webdav_files = webdav_list(REMOTE_AUDIOS_FOLDER)
            for file_path in webdav_files:
                normalized_path = normalize_remote_path(file_path)
                base_name = os.path.basename(normalized_path)

                if any(normalized_path.lower().endswith(ext) for ext in AUDIO_EXTENSIONS):
                    available_formats = self._get_available_formats(base_name)
                    # Consider processed if it is in the registry OR any
                    # output file already exists on disk.
                    is_processed = (normalized_path in self.processed_files or
                                    base_name in self.processed_files or
                                    any(available_formats.values()))

                    files.append({
                        'filename': base_name,
                        'path': normalized_path,
                        'source': 'webdav',
                        'processed': is_processed,
                        # The WebDAV listing exposes no metadata here.
                        'size': 'Unknown',
                        'last_modified': 'Unknown',
                        'available_formats': available_formats
                    })
        except Exception as e:
            logger.error(f"Error obteniendo archivos WebDAV: {e}")

        # --- Local source -------------------------------------------------
        try:
            if os.path.exists(LOCAL_DOWNLOADS_PATH):
                local_files = []
                for ext in AUDIO_EXTENSIONS:
                    local_files.extend(Path(LOCAL_DOWNLOADS_PATH).glob(f"*{ext}"))

                for file_path in local_files:
                    stat = file_path.stat()
                    available_formats = self._get_available_formats(file_path.name)
                    # Same processed heuristic as above: registry entry
                    # or any existing output file counts.
                    is_processed = (file_path.name in self.processed_files or
                                    any(available_formats.values()))

                    files.append({
                        'filename': file_path.name,
                        'path': str(file_path),
                        'source': 'local',
                        'processed': is_processed,
                        'size': self._format_size(stat.st_size),
                        'last_modified': datetime.fromtimestamp(stat.st_mtime).strftime('%Y-%m-%d %H:%M:%S'),
                        'available_formats': available_formats
                    })
        except Exception as e:
            logger.error(f"Error obteniendo archivos locales: {e}")

        # Deduplicate by filename; a WebDAV entry overwrites a local one.
        unique_files = {}
        for file in files:
            key = file['filename']
            if key not in unique_files or file['source'] == 'webdav':
                unique_files[key] = file

        return sorted(unique_files.values(), key=lambda x: x['filename'])

    def _get_available_formats(self, audio_filename: str) -> Dict[str, bool]:
        """Check which output formats already exist for an audio file.

        Probes the downloads and resumenes_docx directories for
        txt/md/pdf/docx outputs, trying several filename variants
        (spaces replaced by underscores, optional "_unificado" suffix).
        """
        # Base name without its extension.
        base_name = Path(audio_filename).stem

        # Output extensions to probe for.
        formats = {
            'txt': False,
            'md': False,
            'pdf': False,
            'docx': False
        }

        # Directories where generated outputs may live.
        directories_to_check = [LOCAL_DOWNLOADS_PATH, './resumenes_docx']

        for directory in directories_to_check:
            if not os.path.exists(directory):
                continue

            for ext in formats.keys():
                # Filename variants the pipeline may have produced.
                name_variants = [
                    base_name,  # exact name
                    f"{base_name}_unificado",  # with "_unificado" suffix
                    f"{base_name.replace(' ', '_')}",  # spaces -> underscores
                    f"{base_name.replace(' ', '_')}_unificado",  # both changes
                ]

                # NOTE(review): this duplicates the second variant above
                # whenever the name contains spaces — harmless, but redundant.
                if ' ' in base_name:
                    name_variants.append(f"{base_name}_unificado")

                # First existing variant wins for this format.
                for name_variant in name_variants:
                    file_path = os.path.join(directory, f"{name_variant}.{ext}")
                    if os.path.exists(file_path):
                        formats[ext] = True
                        break  # found; move on to the next format

        return formats

    def _format_size(self, size_bytes: int) -> str:
        """Format a byte count as a human-readable string (B through TB)."""
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size_bytes < 1024.0:
                return f"{size_bytes:.1f} {unit}"
            size_bytes /= 1024.0
        return f"{size_bytes:.1f} TB"

    def reprocess_file(self, file_path: str, source: str) -> Dict[str, Any]:
        """Trigger reprocessing of one file; return a status dict."""
        try:
            # Record which output formats existed beforehand so the UI
            # can tell the user what will be overwritten.
            filename = os.path.basename(file_path)
            existing_formats = self._get_available_formats(filename)
            has_existing_files = any(existing_formats.values())

            if source == 'webdav':
                # WebDAV files go straight through the main pipeline.
                logger.info(f"Iniciando reprocesamiento de WebDAV: {file_path}")
                process_audio_file(file_path)
            else:
                # NOTE(review): local reprocessing is not implemented —
                # this branch only logs, yet success is still reported.
                logger.info(f"Iniciando reprocesamiento local: {file_path}")

            return {
                'success': True,
                'message': f"Archivo {os.path.basename(file_path)} enviado a reprocesamiento",
                'had_existing_files': has_existing_files,
                'existing_formats': existing_formats
            }
        except Exception as e:
            logger.error(f"Error reprocesando {file_path}: {e}")
            return {
                'success': False,
                'message': f"Error: {str(e)}"
            }

    def mark_as_unprocessed(self, file_path: str) -> bool:
        """Remove a file from the processed registry to force a re-run.

        Rewrites the registry file without any line matching the raw
        path, its normalized form, or its basename, then reloads the
        in-memory set. Returns True on success.
        """
        try:
            # NOTE(review): this loaded set is unused; the removal below
            # works on the raw registry file directly.
            processed_files = load_processed_files()
            normalized_path = normalize_remote_path(file_path)
            base_name = os.path.basename(normalized_path)

            # Write a new registry omitting every entry for this file.
            temp_path = PROCESSED_FILES_PATH + '.temp'
            with open(temp_path, 'w', encoding='utf-8') as f:
                for line in open(PROCESSED_FILES_PATH, 'r', encoding='utf-8'):
                    if (line.strip() != normalized_path and
                        os.path.basename(line.strip()) != base_name and
                        line.strip() != file_path):
                        f.write(line)

            # Atomically replace the original registry, then reload.
            os.replace(temp_path, PROCESSED_FILES_PATH)
            self.load_processed_files()  # refresh in-memory state

            return True
        except Exception as e:
            logger.error(f"Error marcando como no procesado {file_path}: {e}")
            return False
|
||||
|
||||
# Global FileManager instance shared by every Flask route below.
file_manager = FileManager()
|
||||
|
||||
@app.route('/')
def index():
    """Render the dashboard landing page."""
    return render_template('index.html')
|
||||
|
||||
@app.route('/api/files')
def get_files():
    """API endpoint: list audio files with processed/pending counts."""
    try:
        files = file_manager.get_audio_files()
        processed = sum(1 for f in files if f['processed'])
        return jsonify({
            'success': True,
            'files': files,
            'total': len(files),
            'processed': processed,
            'pending': len(files) - processed
        })
    except Exception as e:
        logger.error(f"Error obteniendo archivos: {e}")
        return jsonify({
            'success': False,
            'message': f"Error: {str(e)}"
        }), 500
|
||||
|
||||
@app.route('/api/reprocess', methods=['POST'])
def reprocess_file():
    """API endpoint: queue a single file for reprocessing.

    Expects JSON with `path` (required) and `source` ('webdav'|'local',
    defaults to 'local'); delegates to FileManager.reprocess_file.
    """
    try:
        payload = request.get_json()
        target = payload.get('path')
        origin = payload.get('source', 'local')

        # Guard clause: the file path is mandatory.
        if not target:
            return jsonify({
                'success': False,
                'message': "Path del archivo es requerido"
            }), 400

        return jsonify(file_manager.reprocess_file(target, origin))

    except Exception as e:
        logger.error(f"Error en endpoint reprocesar: {e}")
        return jsonify({
            'success': False,
            'message': f"Error: {str(e)}"
        }), 500
|
||||
|
||||
@app.route('/api/mark-unprocessed', methods=['POST'])
def mark_unprocessed():
    """API endpoint: drop a file from the processed registry.

    Expects JSON with `path`; returns 400 when missing, 500 when the
    registry rewrite fails.
    """
    try:
        payload = request.get_json()
        target = payload.get('path')

        # Guard clause: the file path is mandatory.
        if not target:
            return jsonify({
                'success': False,
                'message': "Path del archivo es requerido"
            }), 400

        if file_manager.mark_as_unprocessed(target):
            return jsonify({
                'success': True,
                'message': "Archivo marcado como no procesado"
            })

        return jsonify({
            'success': False,
            'message': "No se pudo marcar como no procesado"
        }), 500

    except Exception as e:
        logger.error(f"Error marcando como no procesado: {e}")
        return jsonify({
            'success': False,
            'message': f"Error: {str(e)}"
        }), 500
|
||||
|
||||
@app.route('/api/refresh')
def refresh_files():
    """API endpoint: reload the registry and return a fresh file list."""
    try:
        # Re-read the processed registry before listing so the flags
        # reflect any changes made outside the dashboard.
        file_manager.load_processed_files()
        refreshed = file_manager.get_audio_files()
        return jsonify({
            'success': True,
            'message': "Lista de archivos actualizada",
            'files': refreshed
        })
    except Exception as e:
        logger.error(f"Error refrescando archivos: {e}")
        return jsonify({
            'success': False,
            'message': f"Error: {str(e)}"
        }), 500
|
||||
|
||||
@app.route('/downloads/find-file')
def find_and_download_file():
    """Locate and serve an output file, trying common name variants.

    Query params:
        filename: original audio file name (any extension)
        ext: desired output extension (txt/md/pdf/docx)

    Tries <stem>, <stem>_unificado and their underscore variants in
    both the downloads and resumenes_docx directories; 404 otherwise.
    Fix: removed the redundant in-function re-imports of `request` and
    `Path`, which are already imported at module level.
    """
    try:
        filename = request.args.get('filename')
        ext = request.args.get('ext')

        if not filename or not ext:
            return jsonify({'error': 'Missing parameters'}), 400

        # Generate the filename variants the pipeline may have produced.
        base_name = Path(filename).stem
        underscored = base_name.replace(' ', '_')
        possible_names = [
            f"{base_name}.{ext}",
            f"{base_name}_unificado.{ext}",
            f"{underscored}.{ext}",
            f"{underscored}_unificado.{ext}"
        ]

        # Directories where generated outputs may live.
        for directory in (LOCAL_DOWNLOADS_PATH, './resumenes_docx'):
            if not os.path.exists(directory):
                continue

            for name in possible_names:
                if os.path.exists(os.path.join(directory, name)):
                    # send_from_directory guards against path traversal.
                    return send_from_directory(directory, name)

        # No variant exists in any directory.
        return jsonify({'error': 'File not found'}), 404

    except Exception as e:
        logger.error(f"Error buscando archivo: {e}")
        return jsonify({'error': 'File not found'}), 404
|
||||
|
||||
@app.route('/downloads/<path:filename>')
def download_file(filename):
    """Serve a file from downloads/ with resumenes_docx/ as fallback.

    Bug fix: Flask's send_from_directory raises werkzeug `NotFound`
    (an HTTPException), not `FileNotFoundError`, so the original's
    fallback to resumenes_docx was dead code — the first miss jumped
    straight to the generic except handler. Catching NotFound (plus
    FileNotFoundError for older Flask versions) restores the intended
    two-directory lookup. The log message now includes the filename
    instead of the literal "(unknown)".
    """
    from werkzeug.exceptions import NotFound  # bundled with Flask

    try:
        for directory in (LOCAL_DOWNLOADS_PATH, './resumenes_docx'):
            try:
                # send_from_directory guards against path traversal.
                return send_from_directory(directory, filename)
            except (NotFound, FileNotFoundError):
                continue

        # Not present in either location.
        return jsonify({'error': 'File not found'}), 404

    except Exception as e:
        logger.error(f"Error sirviendo archivo {filename}: {e}")
        return jsonify({'error': 'File not found'}), 404
|
||||
|
||||
@app.route('/health')
def health_check():
    """Liveness endpoint reporting the processed-file count."""
    payload = {
        'status': 'healthy',
        'timestamp': datetime.now().isoformat(),
        'processed_files_count': len(file_manager.processed_files)
    }
    return jsonify(payload)
|
||||
|
||||
if __name__ == '__main__':
    # Entry point: start the Flask dashboard on all interfaces, port 5000.
    logger.info("🚀 Iniciando Dashboard de Gestión de Audio")
    logger.info(f"📁 Carpeta de descargas: {LOCAL_DOWNLOADS_PATH}")
    logger.info(f"📊 Servidor web en http://localhost:5000")

    try:
        # Reloader off: it forks a second process, which interferes with
        # threading; threaded=True serves requests concurrently.
        app.run(
            host='0.0.0.0',
            port=5000,
            debug=False,
            threaded=True,
            use_reloader=False
        )
    except KeyboardInterrupt:
        logger.info("🛑 Dashboard detenido por el usuario")
    except Exception as e:
        logger.error(f"❌ Error en dashboard: {e}")
        raise
|
||||
418
docs/DEPLOYMENT.md
Normal file
418
docs/DEPLOYMENT.md
Normal file
@@ -0,0 +1,418 @@
|
||||
# Guia de Despliegue - CBCFacil
|
||||
|
||||
Esta guia describe las opciones y procedimientos para desplegar CBCFacil en diferentes entornos.
|
||||
|
||||
## Opciones de Despliegue
|
||||
|
||||
| Metodo | Complejidad | Recomendado para |
|
||||
|--------|-------------|------------------|
|
||||
| Docker Compose | Baja | Desarrollo, Produccion ligera |
|
||||
| Docker Standalone | Media | Produccion con orquestacion |
|
||||
| Virtual Environment | Baja | Desarrollo local |
|
||||
| Kubernetes | Alta | Produccion a escala |
|
||||
|
||||
## Despliegue con Docker Compose
|
||||
|
||||
### Prerrequisitos
|
||||
|
||||
- Docker 24.0+
|
||||
- Docker Compose 2.20+
|
||||
- NVIDIA Container Toolkit (para GPU)
|
||||
|
||||
### Configuracion
|
||||
|
||||
1. Crear archivo de configuracion:
|
||||
```bash
|
||||
cp .env.example .env.production
|
||||
nano .env.production
|
||||
```
|
||||
|
||||
2. Configurar variables sensibles:
|
||||
```bash
|
||||
# .env.production
|
||||
NEXTCLOUD_URL=https://nextcloud.example.com/remote.php/webdav
|
||||
NEXTCLOUD_USER=tu_usuario
|
||||
NEXTCLOUD_PASSWORD=tu_contrasena_segura
|
||||
|
||||
ANTHROPIC_AUTH_TOKEN=sk-ant-...
|
||||
GEMINI_API_KEY=AIza...
|
||||
|
||||
TELEGRAM_TOKEN=bot_token
|
||||
TELEGRAM_CHAT_ID=chat_id
|
||||
|
||||
CUDA_VISIBLE_DEVICES=all
|
||||
LOG_LEVEL=INFO
|
||||
```
|
||||
|
||||
3. Verificar docker-compose.yml:
|
||||
```yaml
|
||||
# docker-compose.yml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
cbcfacil:
|
||||
build: .
|
||||
container_name: cbcfacil
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "5000:5000"
|
||||
volumes:
|
||||
- ./downloads:/app/downloads
|
||||
- ./resumenes_docx:/app/resumenes_docx
|
||||
- ./logs:/app/logs
|
||||
- ./data:/app/data
|
||||
environment:
|
||||
- NEXTCLOUD_URL=${NEXTCLOUD_URL}
|
||||
- NEXTCLOUD_USER=${NEXTCLOUD_USER}
|
||||
- NEXTCLOUD_PASSWORD=${NEXTCLOUD_PASSWORD}
|
||||
- ANTHROPIC_AUTH_TOKEN=${ANTHROPIC_AUTH_TOKEN}
|
||||
- GEMINI_API_KEY=${GEMINI_API_KEY}
|
||||
- TELEGRAM_TOKEN=${TELEGRAM_TOKEN}
|
||||
- TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
|
||||
- CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}
|
||||
- LOG_LEVEL=${LOG_LEVEL}
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:5000/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
```
|
||||
|
||||
### Despliegue
|
||||
|
||||
```bash
|
||||
# Construir y levantar
|
||||
docker compose up -d --build
|
||||
|
||||
# Ver logs
|
||||
docker compose logs -f cbcfacil
|
||||
|
||||
# Ver estado
|
||||
docker compose ps
|
||||
|
||||
# Reiniciar
|
||||
docker compose restart cbcfacil
|
||||
|
||||
# Detener
|
||||
docker compose down
|
||||
```
|
||||
|
||||
### Actualizacion
|
||||
|
||||
```bash
|
||||
# Hacer backup de datos
|
||||
docker cp cbcfacil:/app/data ./backup/data
|
||||
|
||||
# Actualizar imagen
|
||||
docker compose pull
|
||||
docker compose up -d --build
|
||||
|
||||
# Verificar
|
||||
docker compose logs -f cbcfacil
|
||||
```
|
||||
|
||||
## Despliegue con Docker Standalone
|
||||
|
||||
### Construir Imagen
|
||||
|
||||
```bash
|
||||
# Construir imagen
|
||||
docker build -t cbcfacil:latest .
|
||||
|
||||
# Verificar imagen
|
||||
docker images cbcfacil
|
||||
```
|
||||
|
||||
### Ejecutar Contenedor
|
||||
|
||||
```bash
|
||||
# Con GPU
|
||||
docker run -d \
|
||||
--name cbcfacil \
|
||||
--gpus all \
|
||||
-p 5000:5000 \
|
||||
-v $(pwd)/downloads:/app/downloads \
|
||||
-v $(pwd)/resumenes_docx:/app/resumenes_docx \
|
||||
-v $(pwd)/logs:/app/logs \
|
||||
-e NEXTCLOUD_URL=${NEXTCLOUD_URL} \
|
||||
-e NEXTCLOUD_USER=${NEXTCLOUD_USER} \
|
||||
-e NEXTCLOUD_PASSWORD=${NEXTCLOUD_PASSWORD} \
|
||||
-e ANTHROPIC_AUTH_TOKEN=${ANTHROPIC_AUTH_TOKEN} \
|
||||
-e GEMINI_API_KEY=${GEMINI_API_KEY} \
|
||||
cbcfacil:latest
|
||||
|
||||
# Ver logs
|
||||
docker logs -f cbcfacil
|
||||
|
||||
# Detener
|
||||
docker stop cbcfacil && docker rm cbcfacil
|
||||
```
|
||||
|
||||
## Despliegue Local (Virtual Environment)
|
||||
|
||||
### Prerrequisitos
|
||||
|
||||
- Python 3.10+
|
||||
- NVIDIA drivers (opcional)
|
||||
|
||||
### Instalacion
|
||||
|
||||
```bash
|
||||
# Clonar y entrar al directorio
|
||||
git clone <repo_url>
|
||||
cd cbcfacil
|
||||
|
||||
# Crear entorno virtual
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
|
||||
# Instalar dependencias
|
||||
pip install -r requirements.txt
|
||||
|
||||
# Configurar variables de entorno
|
||||
cp .env.example .env.production
|
||||
nano .env.production
|
||||
|
||||
# Crear directorios
|
||||
mkdir -p downloads resumenes_docx logs data
|
||||
|
||||
# Ejecutar
|
||||
python main.py
|
||||
```
|
||||
|
||||
### Como Servicio Systemd
|
||||
|
||||
```ini
|
||||
# /etc/systemd/system/cbcfacil.service
|
||||
[Unit]
|
||||
Description=CBCFacil AI Service
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=cbcfacil
|
||||
WorkingDirectory=/opt/cbcfacil
|
||||
Environment="PATH=/opt/cbcfacil/.venv/bin"
|
||||
EnvironmentFile=/opt/cbcfacil/.env.production
|
||||
ExecStart=/opt/cbcfacil/.venv/bin/python main.py
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
# Logging
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=cbcfacil
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
```bash
|
||||
# Instalar servicio
|
||||
sudo cp cbcfacil.service /etc/systemd/system/
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable cbcfacil
|
||||
sudo systemctl start cbcfacil
|
||||
|
||||
# Verificar estado
|
||||
sudo systemctl status cbcfacil
|
||||
|
||||
# Ver logs
|
||||
journalctl -u cbcfacil -f
|
||||
```
|
||||
|
||||
## Configuracion de Produccion
|
||||
|
||||
### Variables de Entorno Criticas
|
||||
|
||||
```bash
|
||||
# Obligatorias
|
||||
NEXTCLOUD_URL=...
|
||||
NEXTCLOUD_USER=...
|
||||
NEXTCLOUD_PASSWORD=...
|
||||
|
||||
# Recomendadas para produccion
|
||||
DEBUG=false
|
||||
LOG_LEVEL=WARNING
|
||||
POLL_INTERVAL=10
|
||||
|
||||
# GPU
|
||||
CUDA_VISIBLE_DEVICES=all
|
||||
```
|
||||
|
||||
### Optimizaciones
|
||||
|
||||
```bash
|
||||
# En .env.production
|
||||
# Reducir polling para menor carga
|
||||
POLL_INTERVAL=10
|
||||
|
||||
# Optimizar memoria GPU
|
||||
PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
|
||||
|
||||
# Limitar threads
|
||||
OMP_NUM_THREADS=4
|
||||
MKL_NUM_THREADS=4
|
||||
```
|
||||
|
||||
### Seguridad
|
||||
|
||||
```bash
|
||||
# Crear usuario dedicado
|
||||
sudo useradd -r -s /bin/false cbcfacil
|
||||
|
||||
# Asignar permisos
|
||||
sudo chown -R cbcfacil:cbcfacil /opt/cbcfacil
|
||||
|
||||
# Proteger archivo de variables
|
||||
sudo chmod 600 /opt/cbcfacil/.env.production
|
||||
```
|
||||
|
||||
## Monitoreo
|
||||
|
||||
### Health Check
|
||||
|
||||
```bash
|
||||
# Endpoint de salud
|
||||
curl http://localhost:5000/health
|
||||
|
||||
# Respuesta esperada
|
||||
{"status": "healthy"}
|
||||
```
|
||||
|
||||
### Logging
|
||||
|
||||
```bash
|
||||
# Ver logs en tiempo real
|
||||
docker logs -f cbcfacil
|
||||
|
||||
# O con journalctl
|
||||
journalctl -u cbcfacil -f
|
||||
|
||||
# Logs estructurados en JSON (produccion)
|
||||
LOG_LEVEL=WARNING
|
||||
```
|
||||
|
||||
### Metricas
|
||||
|
||||
El sistema expone metricas via API:
|
||||
```bash
|
||||
# Estado del servicio
|
||||
curl http://localhost:5000/api/status
|
||||
```
|
||||
|
||||
## Respaldo y Recuperacion
|
||||
|
||||
### Respaldo de Datos
|
||||
|
||||
```bash
|
||||
# Directorios a respaldar
|
||||
# - downloads/ (archivos procesados)
|
||||
# - resumenes_docx/ (documentos generados)
|
||||
# - data/ (registros y estados)
|
||||
# - logs/ (logs del sistema)
|
||||
|
||||
# Script de backup
|
||||
#!/bin/bash
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
BACKUP_DIR=/backup/cbcfacil
|
||||
|
||||
mkdir -p $BACKUP_DIR
|
||||
|
||||
tar -czf $BACKUP_DIR/cbcfacil_$DATE.tar.gz \
|
||||
/opt/cbcfacil/downloads \
|
||||
/opt/cbcfacil/resumenes_docx \
|
||||
/opt/cbcfacil/data
|
||||
|
||||
# Limpiar backups antiguos (mantener ultimos 7 dias)
|
||||
find $BACKUP_DIR -name "*.tar.gz" -mtime +7 -delete
|
||||
```
|
||||
|
||||
### Recuperacion
|
||||
|
||||
```bash
|
||||
# Detener servicio
|
||||
docker compose down
|
||||
|
||||
# Restaurar datos
|
||||
tar -xzf backup_*.tar.gz -C /opt/cbcfacil/
|
||||
|
||||
# Verificar permisos
|
||||
chown -R cbcfacil:cbcfacil /opt/cbcfacil
|
||||
|
||||
# Reiniciar servicio
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
## Troubleshooting de Produccion
|
||||
|
||||
### Contenedor no Inicia
|
||||
|
||||
```bash
|
||||
# Verificar logs
|
||||
docker logs cbcfacil
|
||||
|
||||
# Verificar configuracion
|
||||
docker exec -it cbcfacil python -c "from config import settings; print(settings.has_webdav_config)"
|
||||
```
|
||||
|
||||
### Error de Memoria GPU
|
||||
|
||||
```bash
|
||||
# Verificar GPUs disponibles
|
||||
nvidia-smi
|
||||
|
||||
# Liberar memoria
|
||||
sudo nvidia-smi --gpu-reset
|
||||
|
||||
# O limitar GPU
|
||||
CUDA_VISIBLE_DEVICES=0
|
||||
```
|
||||
|
||||
### WebDAV Connection Failed
|
||||
|
||||
```bash
|
||||
# Verificar conectividad
|
||||
curl -u $NEXTCLOUD_USER:$NEXTCLOUD_PASSWORD $NEXTCLOUD_URL
|
||||
|
||||
# Verificar credenciales
|
||||
echo "URL: $NEXTCLOUD_URL"
|
||||
echo "User: $NEXTCLOUD_USER"
|
||||
```
|
||||
|
||||
### High CPU Usage
|
||||
|
||||
```bash
|
||||
# Reducir threads
|
||||
OMP_NUM_THREADS=2
|
||||
MKL_NUM_THREADS=2
|
||||
|
||||
# Aumentar intervalo de polling
|
||||
POLL_INTERVAL=15
|
||||
```
|
||||
|
||||
## Checklist de Produccion
|
||||
|
||||
- [ ] Variables de entorno configuradas
|
||||
- [ ] Credenciales seguras (.env.production)
|
||||
- [ ] Usuario dedicado creado
|
||||
- [ ] Permisos correctos asignados
|
||||
- [ ] Logs configurados
|
||||
- [ ] Health check funcionando
|
||||
- [ ] Backup automatizado configurado
|
||||
- [ ] Monitoreo activo
|
||||
- [ ] SSL/TLS configurado (si aplica)
|
||||
- [ ] Firewall configurado
|
||||
|
||||
## Recursos Adicionales
|
||||
|
||||
- `docs/SETUP.md` - Guia de configuracion inicial
|
||||
- `docs/TESTING.md` - Guia de testing
|
||||
- `ARCHITECTURE.md` - Documentacion arquitectonica
|
||||
337
docs/SETUP.md
Normal file
337
docs/SETUP.md
Normal file
@@ -0,0 +1,337 @@
|
||||
# Guia de Configuracion - CBCFacil
|
||||
|
||||
Esta guia describe los pasos para configurar el entorno de desarrollo de CBCFacil.
|
||||
|
||||
## Requisitos Previos
|
||||
|
||||
### Software Requerido
|
||||
|
||||
| Componente | Version Minima | Recomendada |
|
||||
|------------|----------------|-------------|
|
||||
| Python | 3.10 | 3.11+ |
|
||||
| Git | 2.0 | Latest |
|
||||
| NVIDIA Driver | 535+ | 550+ (para CUDA 12.1) |
|
||||
| Docker | 24.0 | 25.0+ (opcional) |
|
||||
| Docker Compose | 2.20 | Latest (opcional) |
|
||||
|
||||
### Hardware Recomendado
|
||||
|
||||
- **CPU**: 4+ nucleos
|
||||
- **RAM**: 8GB minimo (16GB+ recomendado)
|
||||
- **GPU**: NVIDIA con 4GB+ VRAM (opcional, soporta CPU)
|
||||
- **Almacenamiento**: 10GB+ libres
|
||||
|
||||
## Instalacion Paso a Paso
|
||||
|
||||
### 1. Clonar el Repositorio
|
||||
|
||||
```bash
|
||||
git clone <repo_url>
|
||||
cd cbcfacil
|
||||
```
|
||||
|
||||
### 2. Crear Entorno Virtual
|
||||
|
||||
```bash
|
||||
# Crear venv
|
||||
python3 -m venv .venv
|
||||
|
||||
# Activar (Linux/macOS)
|
||||
source .venv/bin/activate
|
||||
|
||||
# Activar (Windows)
|
||||
.venv\Scripts\activate
|
||||
```
|
||||
|
||||
### 3. Instalar Dependencias
|
||||
|
||||
```bash
|
||||
# Actualizar pip
|
||||
pip install --upgrade pip
|
||||
|
||||
# Instalar dependencias de produccion
|
||||
pip install -r requirements.txt
|
||||
|
||||
# Instalar dependencias de desarrollo (opcional)
|
||||
pip install -r requirements-dev.txt
|
||||
```
|
||||
|
||||
### 4. Configurar Variables de Entorno
|
||||
|
||||
```bash
|
||||
# Copiar template de configuracion
|
||||
cp .env.example .env.secrets
|
||||
|
||||
# Editar con tus credenciales
|
||||
nano .env.secrets
|
||||
```
|
||||
|
||||
### 5. Estructura de Archivos Necesaria
|
||||
|
||||
```bash
|
||||
# Crear directorios requeridos
|
||||
mkdir -p downloads resumenes_docx logs
|
||||
|
||||
# Verificar estructura
|
||||
ls -la
|
||||
```
|
||||
|
||||
## Configuracion de Credenciales
|
||||
|
||||
### Nextcloud/WebDAV
|
||||
|
||||
```bash
|
||||
# Obligatorio para sincronizacion de archivos
|
||||
NEXTCLOUD_URL=https://tu-nextcloud.com/remote.php/webdav
|
||||
NEXTCLOUD_USER=tu_usuario
|
||||
NEXTCLOUD_PASSWORD=tu_contrasena
|
||||
```
|
||||
|
||||
### AI Providers (Opcional)
|
||||
|
||||
```bash
|
||||
# Claude via Z.ai
|
||||
ANTHROPIC_AUTH_TOKEN=sk-ant-...
|
||||
|
||||
# Google Gemini
|
||||
GEMINI_API_KEY=AIza...
|
||||
|
||||
# Gemini CLI (opcional)
|
||||
GEMINI_CLI_PATH=/usr/local/bin/gemini
|
||||
```
|
||||
|
||||
### Telegram (Opcional)
|
||||
|
||||
```bash
|
||||
TELEGRAM_TOKEN=bot_token
|
||||
TELEGRAM_CHAT_ID=chat_id
|
||||
```
|
||||
|
||||
### GPU Configuration (Opcional)
|
||||
|
||||
```bash
|
||||
# Usar GPU especifica
|
||||
CUDA_VISIBLE_DEVICES=0
|
||||
|
||||
# Usar todas las GPUs
|
||||
CUDA_VISIBLE_DEVICES=all
|
||||
|
||||
# Forzar CPU
|
||||
CUDA_VISIBLE_DEVICES=
|
||||
```
|
||||
|
||||
## Verificacion de Instalacion
|
||||
|
||||
### 1. Verificar Python
|
||||
|
||||
```bash
|
||||
python --version
|
||||
# Debe mostrar Python 3.10+
|
||||
```
|
||||
|
||||
### 2. Verificar Dependencias
|
||||
|
||||
```bash
|
||||
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
|
||||
python -c "import flask; print(f'Flask: {flask.__version__}')"
|
||||
python -c "import whisper; print('Whisper instalado correctamente')"
|
||||
```
|
||||
|
||||
### 3. Verificar Configuracion
|
||||
|
||||
```bash
|
||||
python -c "from config import settings; print(f'WebDAV configurado: {settings.has_webdav_config}')"
|
||||
python -c "from config import settings; print(f'AI configurado: {settings.has_ai_config}')"
|
||||
```
|
||||
|
||||
### 4. Test Rapido
|
||||
|
||||
```bash
|
||||
# Ejecutar tests basicos
|
||||
pytest tests/test_config.py -v
|
||||
```
|
||||
|
||||
## Configuracion de Desarrollo
|
||||
|
||||
### IDE Recomendado
|
||||
|
||||
#### VS Code
|
||||
|
||||
```json
|
||||
// .vscode/settings.json
|
||||
{
|
||||
"python.defaultInterpreterPath": ".venv/bin/python",
|
||||
"python.linting.enabled": true,
|
||||
"python.linting.pylintEnabled": true,
|
||||
"editor.formatOnSave": true,
|
||||
"python.formatting.provider": "black"
|
||||
}
|
||||
```
|
||||
|
||||
#### PyCharm
|
||||
|
||||
1. Open Settings > Project > Python Interpreter
|
||||
2. Add Interpreter > Existing Environment
|
||||
3. Select `.venv/bin/python`
|
||||
|
||||
### Git Hooks (Opcional)
|
||||
|
||||
```bash
|
||||
# Instalar pre-commit
|
||||
pip install pre-commit
|
||||
pre-commit install
|
||||
|
||||
# Verificar hooks
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
### Formateo de Codigo
|
||||
|
||||
```bash
|
||||
# Instalar formateadores
|
||||
pip install black isort
|
||||
|
||||
# Formatear codigo
|
||||
black .
|
||||
isort .
|
||||
|
||||
# Verificar estilo
|
||||
black --check .
|
||||
isort --check-only .
|
||||
```
|
||||
|
||||
## Ejecucion del Servicio
|
||||
|
||||
### Modo Desarrollo
|
||||
|
||||
```bash
|
||||
# Activar entorno virtual
|
||||
source .venv/bin/activate
|
||||
|
||||
# Ejecutar servicio completo
|
||||
python main.py
|
||||
|
||||
# Con logging verbose
|
||||
LOG_LEVEL=DEBUG python main.py
|
||||
```
|
||||
|
||||
### Comandos CLI Disponibles
|
||||
|
||||
```bash
|
||||
# Transcribir audio
|
||||
python main.py whisper <archivo_audio> <directorio_output>
|
||||
|
||||
# Procesar PDF
|
||||
python main.py pdf <archivo_pdf> <directorio_output>
|
||||
```
|
||||
|
||||
### Dashboard
|
||||
|
||||
El dashboard estara disponible en:
|
||||
- URL: http://localhost:5000
|
||||
- API: http://localhost:5000/api/
|
||||
|
||||
## Solucion de Problemas Comunes
|
||||
|
||||
### Error: "Module not found"
|
||||
|
||||
```bash
|
||||
# Verificar que el venv esta activado
|
||||
which python
|
||||
# Debe mostrar path hacia .venv/bin/python
|
||||
|
||||
# Reinstalar dependencias
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### Error: "CUDA out of memory"
|
||||
|
||||
```bash
|
||||
# Reducir uso de GPU
|
||||
export CUDA_VISIBLE_DEVICES=
|
||||
|
||||
# O usar solo una GPU
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
```
|
||||
|
||||
### Error: "WebDAV connection failed"
|
||||
|
||||
```bash
|
||||
# Verificar credenciales
|
||||
echo $NEXTCLOUD_URL
|
||||
echo $NEXTCLOUD_USER
|
||||
|
||||
# Probar conexion manualmente
|
||||
curl -u $NEXTCLOUD_USER:$NEXTCLOUD_PASSWORD $NEXTCLOUD_URL
|
||||
```
|
||||
|
||||
### Error: "Telegram token invalid"
|
||||
|
||||
```bash
|
||||
# Verificar token con BotFather
|
||||
# https://t.me/BotFather
|
||||
|
||||
# Verificar variables de entorno
|
||||
echo $TELEGRAM_TOKEN
|
||||
echo $TELEGRAM_CHAT_ID
|
||||
```
|
||||
|
||||
### Error: "Whisper model not found"
|
||||
|
||||
```bash
|
||||
# El modelo se descarga automaticamente la primera vez
|
||||
# Para forzar recarga:
|
||||
rm -rf ~/.cache/whisper
|
||||
```
|
||||
|
||||
## Configuracion de GPU (Opcional)
|
||||
|
||||
### Verificar Instalacion de CUDA
|
||||
|
||||
```bash
|
||||
# Verificar drivers NVIDIA
|
||||
nvidia-smi
|
||||
|
||||
# Verificar CUDA Toolkit
|
||||
nvcc --version
|
||||
|
||||
# Verificar PyTorch CUDA
|
||||
python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
|
||||
```
|
||||
|
||||
### Configurar Memoria GPU
|
||||
|
||||
```bash
|
||||
# En .env.secrets
|
||||
PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
|
||||
```
|
||||
|
||||
## Variables de Entorno Completas
|
||||
|
||||
| Variable | Requerido | Default | Descripcion |
|
||||
|----------|-----------|---------|-------------|
|
||||
| NEXTCLOUD_URL | Si | - | URL WebDAV de Nextcloud |
|
||||
| NEXTCLOUD_USER | Si | - | Usuario Nextcloud |
|
||||
| NEXTCLOUD_PASSWORD | Si | - | Contrasena Nextcloud |
|
||||
| ANTHROPIC_AUTH_TOKEN | No | - | Token Claude/Z.ai |
|
||||
| GEMINI_API_KEY | No | - | API Key Gemini |
|
||||
| TELEGRAM_TOKEN | No | - | Token Bot Telegram |
|
||||
| TELEGRAM_CHAT_ID | No | - | Chat ID Telegram |
|
||||
| CUDA_VISIBLE_DEVICES | No | "all" | GPUs a usar |
|
||||
| POLL_INTERVAL | No | 5 | Segundos entre polls |
|
||||
| LOG_LEVEL | No | "INFO" | Nivel de logging |
|
||||
| DEBUG | No | false | Modo debug |
|
||||
| DASHBOARD_PORT | No | 5000 | Puerto del dashboard |
|
||||
| DASHBOARD_HOST | No | "0.0.0.0" | Host del dashboard |
|
||||
|
||||
## Siguientes Pasos
|
||||
|
||||
1. Verificar instalacion con tests
|
||||
2. Configurar integracion con Nextcloud
|
||||
3. Probar procesamiento de archivos
|
||||
4. Configurar notificaciones Telegram (opcional)
|
||||
|
||||
Ver tambien:
|
||||
- `docs/TESTING.md` - Guia de testing
|
||||
- `docs/DEPLOYMENT.md` - Guia de despliegue
|
||||
- `ARCHITECTURE.md` - Documentacion arquitectonica
|
||||
482
docs/TESTING.md
Normal file
482
docs/TESTING.md
Normal file
@@ -0,0 +1,482 @@
|
||||
# Guia de Testing - CBCFacil
|
||||
|
||||
Esta guia describe como ejecutar y escribir tests para CBCFacil.
|
||||
|
||||
## Estructura de Tests
|
||||
|
||||
```
|
||||
tests/
|
||||
├── conftest.py # Fixtures compartidos
|
||||
├── test_config.py # Tests de configuracion
|
||||
├── test_storage.py # Tests de almacenamiento
|
||||
├── test_webdav.py # Tests de WebDAV
|
||||
├── test_processors.py # Tests de procesadores
|
||||
├── test_ai_providers.py # Tests de AI providers
|
||||
├── test_vram_manager.py # Tests de VRAM manager
|
||||
└── test_main_integration.py # Tests de integracion
|
||||
```
|
||||
|
||||
## Instalacion de Dependencias de Test
|
||||
|
||||
```bash
|
||||
# Activar entorno virtual
|
||||
source .venv/bin/activate
|
||||
|
||||
# Instalar dependencias de desarrollo
|
||||
pip install -r requirements-dev.txt
|
||||
|
||||
# Verificar instalacion
|
||||
pytest --version
|
||||
```
|
||||
|
||||
## Ejecutar Tests
|
||||
|
||||
### Todos los Tests
|
||||
|
||||
```bash
|
||||
# Ejecutar todos los tests
|
||||
pytest tests/
|
||||
|
||||
# Con output detallado
|
||||
pytest tests/ -v
|
||||
```
|
||||
|
||||
### Tests Especificos
|
||||
|
||||
```bash
|
||||
# Tests de configuracion
|
||||
pytest tests/test_config.py -v
|
||||
|
||||
# Tests de almacenamiento
|
||||
pytest tests/test_storage.py -v
|
||||
|
||||
# Tests de WebDAV
|
||||
pytest tests/test_webdav.py -v
|
||||
|
||||
# Tests de procesadores
|
||||
pytest tests/test_processors.py -v
|
||||
|
||||
# Tests de AI providers
|
||||
pytest tests/test_ai_providers.py -v
|
||||
|
||||
# Tests de VRAM manager
|
||||
pytest tests/test_vram_manager.py -v
|
||||
|
||||
# Tests de integracion
|
||||
pytest tests/test_main_integration.py -v
|
||||
```
|
||||
|
||||
### Tests con Coverage
|
||||
|
||||
```bash
|
||||
# Coverage basico
|
||||
pytest tests/ --cov=cbcfacil
|
||||
|
||||
# Coverage con reporte HTML
|
||||
pytest tests/ --cov=cbcfacil --cov-report=html
|
||||
|
||||
# Coverage con reporte term-missing
|
||||
pytest tests/ --cov=cbcfacil --cov-report=term-missing
|
||||
|
||||
# Coverage por modulo
|
||||
pytest tests/ --cov=cbcfacil --cov-report=term-missing --cov-report=annotate
|
||||
```
|
||||
|
||||
### Tests en Modo Watch
|
||||
|
||||
```bash
|
||||
# Recargar automaticamente al detectar cambios
|
||||
pytest-watch tests/
|
||||
```
|
||||
|
||||
### Tests en Paralelo
|
||||
|
||||
```bash
|
||||
# Ejecutar tests en paralelo
|
||||
pytest tests/ -n auto
|
||||
|
||||
# Con numero fijo de workers
|
||||
pytest tests/ -n 4
|
||||
```
|
||||
|
||||
## Escribir Nuevos Tests
|
||||
|
||||
### Estructura Basica
|
||||
|
||||
```python
|
||||
# tests/test_ejemplo.py
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
class TestEjemplo:
|
||||
"""Clase de tests para un modulo"""
|
||||
|
||||
def setup_method(self):
|
||||
"""Setup antes de cada test"""
|
||||
pass
|
||||
|
||||
def teardown_method(self):
|
||||
"""Cleanup despues de cada test"""
|
||||
pass
|
||||
|
||||
def test_funcion_basica(self):
|
||||
"""Test de una funcion basica"""
|
||||
# Arrange
|
||||
input_value = "test"
|
||||
|
||||
# Act
|
||||
result = mi_funcion(input_value)
|
||||
|
||||
# Assert
|
||||
assert result is not None
|
||||
assert result == "expected"
|
||||
```
|
||||
|
||||
### Usar Fixtures
|
||||
|
||||
```python
|
||||
# tests/conftest.py
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
@pytest.fixture
|
||||
def temp_directory(tmp_path):
|
||||
"""Fixture para directorio temporal"""
|
||||
dir_path = tmp_path / "test_files"
|
||||
dir_path.mkdir()
|
||||
return dir_path
|
||||
|
||||
@pytest.fixture
|
||||
def mock_settings():
|
||||
"""Fixture con settings de prueba"""
|
||||
class MockSettings:
|
||||
NEXTCLOUD_URL = "https://test.example.com"
|
||||
NEXTCLOUD_USER = "test_user"
|
||||
NEXTCLOUD_PASSWORD = "test_pass"
|
||||
return MockSettings()
|
||||
|
||||
# En tu test
|
||||
def test_con_fixture(temp_directory, mock_settings):
|
||||
"""Test usando fixtures"""
|
||||
assert temp_directory.exists()
|
||||
assert mock_settings.NEXTCLOUD_URL == "https://test.example.com"
|
||||
```
|
||||
|
||||
### Tests de Configuracion
|
||||
|
||||
```python
|
||||
# tests/test_config.py
|
||||
import pytest
|
||||
from config import settings
|
||||
|
||||
class TestSettings:
|
||||
"""Tests para configuracion"""
|
||||
|
||||
def test_has_webdav_config_true(self):
|
||||
"""Test con WebDAV configurado"""
|
||||
# Verificar que las properties funcionan
|
||||
assert hasattr(settings, 'has_webdav_config')
|
||||
assert hasattr(settings, 'has_ai_config')
|
||||
|
||||
def test_processed_files_path(self):
|
||||
"""Test del path de archivos procesados"""
|
||||
path = settings.processed_files_path
|
||||
assert isinstance(path, Path)
|
||||
assert path.suffix == ".txt"
|
||||
```
|
||||
|
||||
### Tests de WebDAV
|
||||
|
||||
```python
|
||||
# tests/test_webdav.py
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
class TestWebDAVService:
|
||||
"""Tests para WebDAV Service"""
|
||||
|
||||
@pytest.fixture
|
||||
def webdav_service(self):
|
||||
"""Crear instancia del servicio"""
|
||||
from services.webdav_service import webdav_service
|
||||
return webdav_service
|
||||
|
||||
def test_list_remote_path(self, webdav_service):
|
||||
"""Test de listado de archivos remotos"""
|
||||
# Mock del cliente WebDAV
|
||||
with patch('services.webdav_service.WebDAVClient') as mock_client:
|
||||
mock_instance = Mock()
|
||||
mock_instance.list.return_value = ['file1.pdf', 'file2.mp3']
|
||||
mock_client.return_value = mock_instance
|
||||
|
||||
# Inicializar servicio
|
||||
webdav_service.initialize()
|
||||
|
||||
# Test
|
||||
files = webdav_service.list("TestFolder")
|
||||
assert len(files) == 2
|
||||
assert "file1.pdf" in files
|
||||
```
|
||||
|
||||
### Tests de Procesadores
|
||||
|
||||
```python
|
||||
# tests/test_processors.py
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
class TestAudioProcessor:
|
||||
"""Tests para Audio Processor"""
|
||||
|
||||
@pytest.fixture
|
||||
def processor(self):
|
||||
"""Crear procesador"""
|
||||
from processors.audio_processor import AudioProcessor
|
||||
return AudioProcessor()
|
||||
|
||||
def test_process_audio_file(self, processor, tmp_path):
|
||||
"""Test de procesamiento de audio"""
|
||||
# Crear archivo de prueba
|
||||
audio_file = tmp_path / "test.mp3"
|
||||
audio_file.write_bytes(b"fake audio content")
|
||||
|
||||
# Mock de Whisper
|
||||
with patch('processors.audio_processor.whisper') as mock_whisper:
|
||||
mock_whisper.load_model.return_value.transcribe.return_value = {
|
||||
"text": "Texto transcrito de prueba"
|
||||
}
|
||||
|
||||
# Ejecutar
|
||||
result = processor.process(str(audio_file))
|
||||
|
||||
# Verificar
|
||||
assert result is not None
|
||||
```
|
||||
|
||||
### Tests de AI Providers
|
||||
|
||||
```python
|
||||
# tests/test_ai_providers.py
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
class TestClaudeProvider:
|
||||
"""Tests para Claude Provider"""
|
||||
|
||||
def test_summarize_text(self):
|
||||
"""Test de resumen con Claude"""
|
||||
from services.ai.claude_provider import ClaudeProvider
|
||||
|
||||
provider = ClaudeProvider()
|
||||
test_text = "Texto largo para resumir..."
|
||||
|
||||
# Mock de la llamada API
|
||||
with patch.object(provider, '_call_api') as mock_call:
|
||||
mock_call.return_value = "Texto resumido"
|
||||
|
||||
result = provider.summarize(test_text)
|
||||
|
||||
assert result == "Texto resumido"
|
||||
mock_call.assert_called_once()
|
||||
```
|
||||
|
||||
### Tests de Integracion
|
||||
|
||||
```python
|
||||
# tests/test_main_integration.py
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
|
||||
class TestMainIntegration:
|
||||
"""Tests de integracion del main"""
|
||||
|
||||
def test_main_loop_no_files(self):
|
||||
"""Test del loop principal sin archivos nuevos"""
|
||||
with patch('main.webdav_service') as mock_webdav:
|
||||
with patch('main.processed_registry') as mock_registry:
|
||||
mock_webdav.list.return_value = []
|
||||
mock_registry.is_processed.return_value = True
|
||||
|
||||
# El loop no debe procesar nada
|
||||
# Verificar que no se llama a procesadores
|
||||
```
|
||||
|
||||
## Configuracion de pytest
|
||||
|
||||
```ini
|
||||
# pytest.ini o pyproject.toml
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
python_files = ["test_*.py"]
|
||||
python_classes = ["Test*"]
|
||||
python_functions = ["test_*"]
|
||||
addopts = [
|
||||
"-v",
|
||||
"--tb=short",
|
||||
"--strict-markers",
|
||||
]
|
||||
filterwarnings = [
|
||||
"ignore::DeprecationWarning",
|
||||
]
|
||||
```
|
||||
|
||||
## Mejores Practicas
|
||||
|
||||
### 1. Nombrado de Tests
|
||||
|
||||
```python
|
||||
# BIEN
|
||||
def test_webdav_service_list_returns_files():
|
||||
...
|
||||
|
||||
def test_processed_registry_is_processed_true_for_processed_file():
|
||||
...
|
||||
|
||||
# MAL
|
||||
def test_list():
|
||||
...
|
||||
|
||||
def test_check():
|
||||
...
|
||||
```
|
||||
|
||||
### 2. Estructura AAA
|
||||
|
||||
```python
|
||||
def test_ejemplo_aaa():
|
||||
# Arrange
|
||||
input_data = {"key": "value"}
|
||||
expected = "result"
|
||||
|
||||
# Act
|
||||
actual = function_under_test(input_data)
|
||||
|
||||
# Assert
|
||||
assert actual == expected
|
||||
```
|
||||
|
||||
### 3. Tests Aislados
|
||||
|
||||
```python
|
||||
# Cada test debe ser independiente
|
||||
def test_independent():
|
||||
# No depender de estado de otros tests
|
||||
# Usar fixtures para setup/cleanup
|
||||
pass
|
||||
```
|
||||
|
||||
### 4. Evitar Tests de Implementacion
|
||||
|
||||
```python
|
||||
# BIEN - Test del comportamiento, no la implementacion
|
||||
def test_registry_returns_true_for_processed_file():
|
||||
registry = ProcessedRegistry()
|
||||
registry.save("file.txt")
|
||||
assert registry.is_processed("file.txt") is True
|
||||
|
||||
# MAL - Test de implementacion
|
||||
def test_registry_uses_set_internally():
|
||||
# No testar detalles de implementacion
|
||||
registry = ProcessedRegistry()
|
||||
assert hasattr(registry, '_processed_files')
|
||||
```
|
||||
|
||||
### 5. Mocks Apropiados
|
||||
|
||||
```python
|
||||
# Usar mocks para dependencias externas
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
|
||||
def test_with_mocked_api():
|
||||
with patch('requests.get') as mock_get:
|
||||
mock_response = Mock()
|
||||
mock_response.json.return_value = {"key": "value"}
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = my_api_function()
|
||||
|
||||
assert result == {"key": "value"}
|
||||
```
|
||||
|
||||
## Coverage Objetivo
|
||||
|
||||
| Componente | Coverage Minimo |
|
||||
|------------|-----------------|
|
||||
| config/ | 90% |
|
||||
| core/ | 90% |
|
||||
| services/ | 70% |
|
||||
| processors/ | 60% |
|
||||
| storage/ | 90% |
|
||||
| api/ | 80% |
|
||||
|
||||
## Integracion con CI/CD
|
||||
|
||||
```yaml
|
||||
# .github/workflows/tests.yml
|
||||
name: Tests
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r requirements.txt
|
||||
pip install -r requirements-dev.txt
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
pytest tests/ --cov=cbcfacil --cov-report=xml
|
||||
|
||||
- name: Upload coverage
|
||||
uses: codecov/codecov-action@v3
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Tests Fallan por Imports
|
||||
|
||||
```bash
|
||||
# Verificar que el venv esta activado
|
||||
source .venv/bin/activate
|
||||
|
||||
# Reinstalar el paquete en modo desarrollo
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
### Tests Muy Lentos
|
||||
|
||||
```bash
|
||||
# Ejecutar en paralelo
|
||||
pytest tests/ -n auto
|
||||
|
||||
# O ejecutar solo tests rapidos
|
||||
pytest tests/ -m "not slow"
|
||||
```
|
||||
|
||||
### Memory Errors
|
||||
|
||||
```bash
|
||||
# Reducir workers
|
||||
pytest tests/ -n 2
|
||||
|
||||
# O ejecutar secuencial
|
||||
pytest tests/ -n 0
|
||||
```
|
||||
|
||||
## Recursos Adicionales
|
||||
|
||||
- [Documentacion de pytest](https://docs.pytest.org/)
|
||||
- [Documentacion de unittest.mock](https://docs.python.org/3/library/unittest.mock.html)
|
||||
- [pytest-cov](https://pytest-cov.readthedocs.io/)
|
||||
7
document/__init__.py
Normal file
7
document/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""
|
||||
Document generation package for CBCFacil
|
||||
"""
|
||||
|
||||
from .generators import DocumentGenerator
|
||||
|
||||
__all__ = ['DocumentGenerator']
|
||||
164
document/generators.py
Normal file
164
document/generators.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
Document generation utilities
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List, Tuple
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from ..services.ai import ai_provider_factory
|
||||
|
||||
|
||||
class DocumentGenerator:
    """Generate summary documents (Markdown, DOCX and PDF) from processed text."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # The best available AI provider is resolved once at construction time.
        self.ai_provider = ai_provider_factory.get_best_provider()

    def generate_summary(self, text: str, base_name: str) -> Tuple[bool, str, Dict[str, Any]]:
        """Summarize *text* and render the summary as .md, .docx and .pdf.

        Args:
            text: Full source text to summarize.
            base_name: Base name (no extension) used for all output files.

        Returns:
            (success, summary, metadata). On failure returns (False, "", {})
            after logging the error; no exception escapes.
        """
        self.logger.info(f"Generating summary for {base_name}")

        try:
            summary = self.ai_provider.summarize(text)

            # Topic-based filename suggestion; stored in metadata only — the
            # generated documents themselves are named after base_name.
            filename = self._generate_filename(text, summary, base_name)

            markdown_path = self._create_markdown(summary, base_name)
            docx_path = self._create_docx(summary, base_name)
            pdf_path = self._create_pdf(summary, base_name)

            metadata = {
                'markdown_path': str(markdown_path),
                'docx_path': str(docx_path),
                'pdf_path': str(pdf_path),
                'docx_name': Path(docx_path).name,
                'summary': summary,
                'filename': filename
            }

            return True, summary, metadata

        except Exception as e:
            self.logger.error(f"Summary generation failed: {e}")
            return False, "", {}

    def _generate_filename(self, text: str, summary: str, base_name: str = "documento") -> str:
        """Generate an intelligent filename from the summary's key topics.

        Args:
            text: Original text (currently unused; kept for interface stability).
            summary: Generated summary to extract topics from.
            base_name: Fallback name when topic extraction fails.
        """
        try:
            prompt = f"""Extract 2-3 key topics from this summary to create a filename.
Summary: {summary}

Return only the topics separated by hyphens, max 20 chars each, in Spanish:"""

            # NOTE(review): sanitize_input() presumably cleans text rather than
            # querying the model; when absent, the summary's first 100 chars are
            # used directly — confirm intended behavior against the providers.
            topics_text = self.ai_provider.sanitize_input(prompt) if hasattr(self.ai_provider, 'sanitize_input') else summary[:100]

            # Heuristic topic extraction: capitalized Spanish words.
            topics = re.findall(r'\b[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+\b', topics_text)[:3]
            if not topics:
                topics = ['documento']

            # Limit each topic and the combined filename length.
            topics = [t[:settings.MAX_FILENAME_TOPICS_LENGTH] for t in topics]

            filename = '_'.join(topics)[:settings.MAX_FILENAME_LENGTH]
            return filename

        except Exception as e:
            self.logger.error(f"Filename generation failed: {e}")
            # Fixed: this branch previously referenced an undefined variable
            # (base_name was not a parameter), raising NameError instead of
            # falling back gracefully.
            return base_name[:settings.MAX_FILENAME_BASE_LENGTH]

    def _create_markdown(self, summary: str, base_name: str) -> Path:
        """Write the summary as a Markdown document and return its path."""
        output_dir = settings.LOCAL_DOWNLOADS_PATH
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / f"{base_name}_unificado.md"

        content = f"""# {base_name.replace('_', ' ').title()}

## Resumen

{summary}

---

*Generado por CBCFacil*
"""

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)

        return output_path

    def _create_docx(self, summary: str, base_name: str) -> Path:
        """Write the summary as a DOCX document and return its path.

        Raises:
            FileProcessingError: if python-docx is not installed.
        """
        try:
            from docx import Document
        except ImportError:
            raise FileProcessingError("python-docx not installed")

        output_dir = settings.LOCAL_DOCX
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / f"{base_name}_unificado.docx"

        doc = Document()
        doc.add_heading(base_name.replace('_', ' ').title(), 0)

        doc.add_heading('Resumen', level=1)
        doc.add_paragraph(summary)

        doc.add_page_break()
        doc.add_paragraph("*Generado por CBCFacil*")

        doc.save(output_path)
        return output_path

    def _create_pdf(self, summary: str, base_name: str) -> Path:
        """Write the summary as a simple PDF document and return its path.

        Raises:
            FileProcessingError: if reportlab is not installed.
        """
        try:
            from reportlab.lib.pagesizes import letter
            from reportlab.pdfgen import canvas
        except ImportError:
            raise FileProcessingError("reportlab not installed")

        output_dir = settings.LOCAL_DOWNLOADS_PATH
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / f"{base_name}_unificado.pdf"

        c = canvas.Canvas(str(output_path), pagesize=letter)
        width, height = letter

        # Title
        c.setFont("Helvetica-Bold", 16)
        title = base_name.replace('_', ' ').title()
        c.drawString(100, height - 100, title)

        # Summary body
        c.setFont("Helvetica", 12)
        y_position = height - 140

        # Line-by-line rendering with page breaks. NOTE(review): lines longer
        # than the page width are not wrapped and will overflow — confirm
        # whether summaries can contain very long lines.
        lines = summary.split('\n')
        for line in lines:
            if y_position < 100:
                c.showPage()
                y_position = height - 100
                c.setFont("Helvetica", 12)

            c.drawString(100, y_position, line)
            y_position -= 20

        c.showPage()
        c.save()

        return output_path
|
||||
148
main.py.backup
Normal file
148
main.py.backup
Normal file
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
CBCFacil - Main Service Entry Point
|
||||
Unified AI service for document processing (audio, PDF, text)
|
||||
"""
|
||||
import fcntl
import logging
import os
import sys
import time
from pathlib import Path
from typing import TextIO
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def acquire_lock() -> TextIO:
    """Acquire the single-instance lock via an exclusive flock on a PID file.

    Returns:
        The open lock file handle. It must stay open for the lock's lifetime
        and be passed to release_lock() on shutdown.
        (Fixed: the annotation previously claimed ``int``.)

    Raises:
        BlockingIOError: if another instance already holds the lock.
    """
    lock_file = Path(os.getenv("LOCAL_STATE_DIR", str(Path(__file__).parent))) / ".main_service.lock"
    lock_file.parent.mkdir(parents=True, exist_ok=True)
    lock_fd = open(lock_file, 'w')
    # LOCK_NB: fail immediately instead of blocking when a second instance starts.
    fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    lock_fd.write(str(os.getpid()))
    lock_fd.flush()
    logger.info(f"Lock acquired. PID: {os.getpid()}")
    return lock_fd
|
||||
|
||||
|
||||
def release_lock(lock_fd) -> None:
    """Release the single-instance lock and close its file handle.

    Errors are logged as warnings and deliberately swallowed so that
    shutdown always completes.
    """
    try:
        fcntl.flock(lock_fd.fileno(), fcntl.LOCK_UN)
        lock_fd.close()
    except Exception as e:
        logger.warning(f"Could not release lock: {e}")
|
||||
|
||||
|
||||
def initialize_services() -> None:
    """Initialize all services: Telegram and WebDAV (both optional, gated on
    configuration), then the VRAM manager and the processed-files registry.

    Imports are local to avoid paying service import costs at module load.
    """
    from config import settings
    from services.webdav_service import webdav_service
    from services.vram_manager import vram_manager
    from services.telegram_service import telegram_service
    from storage.processed_registry import processed_registry

    # Configure Telegram if credentials available; a start notification is
    # sent immediately after configuration.
    if settings.TELEGRAM_TOKEN and settings.TELEGRAM_CHAT_ID:
        telegram_service.configure(settings.TELEGRAM_TOKEN, settings.TELEGRAM_CHAT_ID)
        telegram_service.send_start_notification()

    # Initialize WebDAV if configured
    if settings.has_webdav_config:
        webdav_service.initialize()

    # Initialize VRAM manager
    vram_manager.initialize()

    # Initialize processed registry
    processed_registry.initialize()

    logger.info("All services initialized")
|
||||
|
||||
|
||||
def _process_remote_folder(processor, folder, extensions, webdav_service,
                           processed_registry, ensure_dir=False):
    """Process every unprocessed file in *folder* matching *extensions*.

    Args:
        processor: Object with a ``process(file_path)`` method.
        folder: Remote WebDAV folder to list.
        extensions: Iterable of lowercase extensions to accept.
        webdav_service: WebDAV client service.
        processed_registry: Registry tracking already-processed files.
        ensure_dir: When True, create the remote folder before listing.
    """
    if ensure_dir:
        webdav_service.mkdir(folder)
    for file_path in webdav_service.list(folder):
        if any(file_path.lower().endswith(ext) for ext in extensions):
            if not processed_registry.is_processed(file_path):
                processor.process(file_path)
                processed_registry.save(file_path)


def run_main_loop() -> None:
    """Main processing loop: poll the remote WebDAV folders forever and
    process any new PDF, audio or text files.

    Refactored: the three previously duplicated polling sections now share
    _process_remote_folder(), and has_webdav_config is checked once per cycle.
    """
    from config import settings
    from services.webdav_service import webdav_service
    from storage.processed_registry import processed_registry
    from processors.audio_processor import AudioProcessor
    from processors.pdf_processor import PDFProcessor
    from processors.text_processor import TextProcessor

    audio_processor = AudioProcessor()
    pdf_processor = PDFProcessor()
    text_processor = TextProcessor()

    while True:
        try:
            logger.info("--- Polling for new files ---")
            # Reload so files processed elsewhere are not repeated.
            processed_registry.load()

            if settings.has_webdav_config:
                # PDFs (the remote folder is created on demand).
                _process_remote_folder(
                    pdf_processor, settings.REMOTE_PDF_FOLDER, ('.pdf',),
                    webdav_service, processed_registry, ensure_dir=True,
                )
                # Audio files.
                _process_remote_folder(
                    audio_processor, settings.REMOTE_AUDIOS_FOLDER,
                    settings.AUDIO_EXTENSIONS, webdav_service, processed_registry,
                )
                # Text files.
                _process_remote_folder(
                    text_processor, settings.REMOTE_TXT_FOLDER,
                    settings.TXT_EXTENSIONS, webdav_service, processed_registry,
                )

        except Exception as e:
            # Keep the service alive: a failed cycle is logged, never fatal.
            logger.error(f"Error in main loop: {e}")

        logger.info(f"Cycle completed. Waiting {settings.POLL_INTERVAL} seconds...")
        time.sleep(settings.POLL_INTERVAL)
|
||||
|
||||
|
||||
def main():
    """Main entry point.

    Holds the single-instance lock for the whole service lifetime;
    KeyboardInterrupt triggers a clean shutdown, and the lock is always
    released in the finally block.
    """
    lock_fd = acquire_lock()
    try:
        logger.info("=== CBCFacil Service Started ===")
        initialize_services()
        run_main_loop()
    except KeyboardInterrupt:
        logger.info("Shutdown requested")
    finally:
        release_lock(lock_fd)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Dispatch one-shot CLI commands; with no arguments, run the full service.
    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == "whisper" and len(sys.argv) == 4:
            from processors.audio_processor import AudioProcessor
            # NOTE(review): the output-directory argument (sys.argv[3]) is
            # required by the arity check but not forwarded to process() —
            # confirm whether AudioProcessor should honor it.
            AudioProcessor().process(sys.argv[2])
        elif command == "pdf" and len(sys.argv) == 4:
            from processors.pdf_processor import PDFProcessor
            PDFProcessor().process(sys.argv[2])
        else:
            # Fixed: the usage line previously omitted the two required
            # positional arguments that the arity checks above demand.
            print("Usage: python main.py [whisper|pdf] <input_file> <output_dir>")
            sys.exit(1)
    else:
        main()
|
||||
15
processors/__init__.py
Normal file
15
processors/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
Processors package for CBCFacil
|
||||
"""
|
||||
|
||||
from .base_processor import FileProcessor
|
||||
from .audio_processor import AudioProcessor
|
||||
from .pdf_processor import PDFProcessor
|
||||
from .text_processor import TextProcessor
|
||||
|
||||
__all__ = [
|
||||
'FileProcessor',
|
||||
'AudioProcessor',
|
||||
'PDFProcessor',
|
||||
'TextProcessor'
|
||||
]
|
||||
93
processors/audio_processor.py
Normal file
93
processors/audio_processor.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""
|
||||
Audio file processor using Whisper
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from ..services import vram_manager
|
||||
from ..services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
try:
|
||||
import whisper
|
||||
import torch
|
||||
WHISPER_AVAILABLE = True
|
||||
except ImportError:
|
||||
WHISPER_AVAILABLE = False
|
||||
|
||||
|
||||
class AudioProcessor(FileProcessor):
    """Processor for audio files: transcribes Spanish audio with Whisper."""

    def __init__(self):
        super().__init__("AudioProcessor")
        self.logger = logging.getLogger(__name__)
        self._model = None            # lazily-loaded Whisper model
        self._model_name = "medium"   # Optimized for Spanish
        self._device = None           # device the model was loaded on

    def can_process(self, file_path: str) -> bool:
        """Return True when the file has a configured audio extension."""
        ext = self.get_file_extension(file_path)
        return ext in settings.AUDIO_EXTENSIONS

    def _load_model(self):
        """Load the Whisper model lazily on the detected device.

        Raises:
            FileProcessingError: if the whisper package is not installed.
        """
        if not WHISPER_AVAILABLE:
            raise FileProcessingError("Whisper not installed")

        if self._model is None:
            device = gpu_detector.get_device()
            self.logger.info(f"Loading Whisper model: {self._model_name} on {device}")
            self._model = whisper.load_model(self._model_name, device=device)
            # Remember the device so process() can pick a valid precision.
            self._device = device
            vram_manager.update_usage()

    def process(self, file_path: str) -> Dict[str, Any]:
        """Transcribe an audio file to ``<stem>.txt`` in the downloads folder.

        Returns:
            Dict with ``success``, ``transcription_path``, ``text`` and
            ``model_used``.

        Raises:
            FileProcessingError: on validation, model-load or transcription failure.
        """
        self.validate_file(file_path)
        audio_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{audio_path.stem}.txt"

        self.logger.info(f"Processing audio file: {audio_path}")

        try:
            # Load model if needed
            self._load_model()

            # Update VRAM usage
            vram_manager.update_usage()

            # fp16 is only supported on GPU; forcing it on CPU makes Whisper
            # warn and fall back to fp32 (fixed: was unconditionally True).
            use_fp16 = self._device is not None and self._device != "cpu"

            # inference_mode disables autograd bookkeeping for lower memory use.
            with torch.inference_mode():
                result = self._model.transcribe(
                    str(audio_path),
                    language="es",
                    fp16=use_fp16,
                    verbose=False
                )

            # Save transcription
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(result["text"])

            self.logger.info(f"Transcription completed: {output_path}")

            return {
                "success": True,
                "transcription_path": str(output_path),
                "text": result["text"],
                "model_used": self._model_name
            }

        except FileProcessingError:
            # Already a domain error (e.g. "Whisper not installed"); don't
            # double-wrap it.
            raise
        except Exception as e:
            self.logger.error(f"Audio processing failed: {e}")
            raise FileProcessingError(f"Audio processing failed: {e}") from e

    def cleanup(self) -> None:
        """Drop the model reference and release VRAM."""
        if self._model is not None:
            del self._model
            self._model = None
            vram_manager.cleanup()
|
||||
40
processors/base_processor.py
Normal file
40
processors/base_processor.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""
|
||||
Base File Processor (Strategy Pattern)
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from ..core import FileProcessingError
|
||||
|
||||
|
||||
class FileProcessor(ABC):
    """Common interface for file processors (Strategy pattern).

    Concrete subclasses implement :meth:`can_process` and :meth:`process`;
    the remaining helpers provide shared path handling and validation.
    """

    def __init__(self, name: str):
        # Human-readable processor name, e.g. "PDFProcessor".
        self.name = name

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Check if processor can handle this file type"""

    @abstractmethod
    def process(self, file_path: str) -> Dict[str, Any]:
        """Process the file"""

    def get_file_extension(self, file_path: str) -> str:
        """Return the lower-cased extension of *file_path*, dot included."""
        return Path(file_path).suffix.lower()

    def get_base_name(self, file_path: str) -> str:
        """Return the file name of *file_path* without its extension."""
        return Path(file_path).stem

    def validate_file(self, file_path: str) -> None:
        """Raise FileProcessingError unless *file_path* is an existing file."""
        candidate = Path(file_path)
        if not candidate.exists():
            raise FileProcessingError(f"File not found: {file_path}")
        if not candidate.is_file():
            raise FileProcessingError(f"Path is not a file: {file_path}")
|
||||
159
processors/pdf_processor.py
Normal file
159
processors/pdf_processor.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
PDF file processor with OCR
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from ..services import vram_manager
|
||||
from ..services.gpu_detector import gpu_detector
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
try:
|
||||
import torch
|
||||
import pytesseract
|
||||
import easyocr
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pdf2image import convert_from_path
|
||||
from PIL import Image
|
||||
PDF_OCR_AVAILABLE = True
|
||||
except ImportError:
|
||||
PDF_OCR_AVAILABLE = False
|
||||
|
||||
|
||||
class PDFProcessor(FileProcessor):
    """Processor for PDF files with OCR.

    Renders each page to an image (pdf2image), then runs EasyOCR and
    Tesseract in parallel and keeps, per page, the first non-empty result
    (EasyOCR preferred). Output is written as plain text.
    """

    def __init__(self):
        super().__init__("PDFProcessor")
        self.logger = logging.getLogger(__name__)
        # EasyOCR reader is loaded lazily on first use (may allocate VRAM).
        self._easyocr_reader = None

    def can_process(self, file_path: str) -> bool:
        """Check if file is a PDF"""
        return self.get_file_extension(file_path) == ".pdf"

    def _load_easyocr(self):
        """Load the EasyOCR reader lazily (Spanish model, GPU when available)."""
        if self._easyocr_reader is None:
            use_gpu = gpu_detector.is_available()
            self.logger.info(f"Loading EasyOCR reader (GPU: {use_gpu})")
            self._easyocr_reader = easyocr.Reader(['es'], gpu=use_gpu)
            vram_manager.update_usage()

    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """Preprocess image for better OCR (grayscale + 2x upscale)."""
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')

        # Upscale 2x with Lanczos resampling; larger glyphs help both engines.
        image = image.resize((image.width * 2, image.height * 2), Image.Resampling.LANCZOS)

        return image

    def _run_ocr_parallel(self, pil_images) -> Dict[str, list]:
        """Run both OCR engines concurrently over *pil_images*.

        Returns a dict with one result list per engine ('easyocr',
        'tesseract'), each the same length as *pil_images*; an engine that
        fails contributes empty strings for every page.
        """
        results = {
            'easyocr': [''] * len(pil_images),
            'tesseract': [''] * len(pil_images)
        }

        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {}

            # EasyOCR (skipped when the reader was never loaded)
            if self._easyocr_reader:
                # NOTE(review): readtext_batched with detail=0 returns, per
                # image, a list of text fragments (not a single string) —
                # confirm the per-page indexing in process() expects that.
                futures['easyocr'] = executor.submit(
                    self._easyocr_reader.readtext_batched,
                    pil_images,
                    detail=0
                )

            # Tesseract (Spanish language pack 'spa')
            futures['tesseract'] = executor.submit(
                lambda imgs: [pytesseract.image_to_string(img, lang='spa') for img in imgs],
                pil_images
            )

            # Collect results; a failed engine keeps its empty placeholders.
            for name, future in futures.items():
                try:
                    results[name] = future.result()
                except Exception as e:
                    self.logger.error(f"OCR engine {name} failed: {e}")
                    results[name] = [''] * len(pil_images)

        return results

    def process(self, file_path: str) -> Dict[str, Any]:
        """Process PDF with OCR.

        Writes the combined page text to ``<stem>.txt`` under
        ``settings.LOCAL_DOWNLOADS_PATH``.

        Raises:
            FileProcessingError: when OCR dependencies are missing or any
                step of the pipeline fails.
        """
        self.validate_file(file_path)
        pdf_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / f"{pdf_path.stem}.txt"

        if not PDF_OCR_AVAILABLE:
            raise FileProcessingError("PDF OCR dependencies not installed")

        self.logger.info(f"Processing PDF file: {pdf_path}")

        try:
            # Load EasyOCR if needed
            self._load_easyocr()
            vram_manager.update_usage()

            # Convert PDF to images (one PIL image per page)
            self.logger.debug("Converting PDF to images")
            pil_images = convert_from_path(
                str(pdf_path),
                dpi=settings.PDF_DPI,
                fmt='png',
                thread_count=settings.PDF_RENDER_THREAD_COUNT
            )

            # Process in batches to bound memory use
            all_text = []
            batch_size = settings.PDF_BATCH_SIZE

            for i in range(0, len(pil_images), batch_size):
                batch = pil_images[i:i + batch_size]
                self.logger.debug(f"Processing batch {i//batch_size + 1}/{(len(pil_images) + batch_size - 1)//batch_size}")

                # Preprocess images
                preprocessed_batch = [self._preprocess_image(img) for img in batch]

                # Run OCR in parallel
                ocr_results = self._run_ocr_parallel(preprocessed_batch)

                # Combine results: prefer EasyOCR, fall back to Tesseract,
                # drop pages where both engines returned nothing.
                for j, img in enumerate(batch):
                    text = ocr_results['easyocr'][j] if ocr_results['easyocr'][j] else ocr_results['tesseract'][j]
                    if text:
                        all_text.append(text)

            # Save combined text, one blank line between pages
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write("\n\n".join(all_text))

            self.logger.info(f"PDF processing completed: {output_path}")

            return {
                "success": True,
                "text_path": str(output_path),
                "text": "\n\n".join(all_text),
                "pages_processed": len(pil_images)
            }

        except Exception as e:
            self.logger.error(f"PDF processing failed: {e}")
            raise FileProcessingError(f"PDF processing failed: {e}")

    def cleanup(self) -> None:
        """Cleanup OCR models"""
        # Dropping the reference lets the GPU allocator reclaim the model.
        self._easyocr_reader = None
        vram_manager.cleanup()
|
||||
55
processors/text_processor.py
Normal file
55
processors/text_processor.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""
|
||||
Text file processor
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from ..core import FileProcessingError
|
||||
from ..config import settings
|
||||
from .base_processor import FileProcessor
|
||||
|
||||
|
||||
class TextProcessor(FileProcessor):
    """Processor for text files.

    Copies the source file into ``settings.LOCAL_DOWNLOADS_PATH`` and
    returns its content.
    """

    def __init__(self):
        super().__init__("TextProcessor")
        self.logger = logging.getLogger(__name__)

    def can_process(self, file_path: str) -> bool:
        """Check if file has a supported text extension."""
        ext = self.get_file_extension(file_path)
        return ext in settings.TXT_EXTENSIONS

    def process(self, file_path: str) -> Dict[str, Any]:
        """Copy the text file to the downloads directory.

        Returns:
            Dict with ``success``, the destination ``text_path`` and the
            file ``text`` content.

        Raises:
            FileProcessingError: if validation, reading or writing fails.
        """
        self.validate_file(file_path)
        text_path = Path(file_path)
        output_path = settings.LOCAL_DOWNLOADS_PATH / text_path.name

        self.logger.info(f"Processing text file: {text_path}")

        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # Read once and reuse the content for both the copy and the
            # return value (the original wrote the file and then re-read
            # the copy it had just written from disk).
            content = text_path.read_text(encoding='utf-8')
            output_path.write_text(content, encoding='utf-8')

            self.logger.info(f"Text file processing completed: {output_path}")

            return {
                "success": True,
                "text_path": str(output_path),
                "text": content
            }

        except Exception as e:
            self.logger.error(f"Text processing failed: {e}")
            raise FileProcessingError(f"Text processing failed: {e}")

    def _read_file(self, file_path: Path) -> str:
        """Read and return the whole file content as UTF-8 text."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
|
||||
23
requirements-dev.txt
Normal file
23
requirements-dev.txt
Normal file
@@ -0,0 +1,23 @@
|
||||
# Development dependencies
|
||||
pytest>=7.4.0
|
||||
pytest-cov>=4.1.0
|
||||
pytest-mock>=3.11.0
|
||||
pytest-asyncio>=0.21.0
|
||||
coverage>=7.3.0
|
||||
|
||||
# Code quality
|
||||
black>=23.0.0
|
||||
flake8>=6.0.0
|
||||
mypy>=1.5.0
|
||||
isort>=5.12.0
|
||||
|
||||
# Security
|
||||
bandit>=1.7.5
|
||||
safety>=2.3.0
|
||||
|
||||
# Performance testing
|
||||
pytest-benchmark>=4.0.0
|
||||
|
||||
# Documentation
|
||||
mkdocs>=1.5.0
|
||||
mkdocs-material>=9.0.0
|
||||
@@ -1,17 +1,31 @@
|
||||
Flask
|
||||
Flask-CORS
|
||||
Pillow
|
||||
easyocr
|
||||
numpy
|
||||
openai-whisper
|
||||
opencv-python-headless
|
||||
pdf2image
|
||||
pypdf
|
||||
python-docx
|
||||
python-dotenv
|
||||
pytesseract
|
||||
reportlab
|
||||
requests
|
||||
torch
|
||||
transformers
|
||||
webdavclient3
|
||||
# Core web framework
|
||||
Flask>=3.0.0
|
||||
Flask-CORS>=4.0.0
|
||||
|
||||
# AI/ML dependencies
|
||||
torch>=2.0.0
|
||||
torchvision>=0.15.0
|
||||
openai-whisper>=20231117
|
||||
transformers>=4.30.0
|
||||
easyocr>=1.7.0
|
||||
|
||||
# Image processing
|
||||
Pillow>=10.0.0
|
||||
opencv-python-headless>=4.8.0
|
||||
|
||||
# Document processing
|
||||
pdf2image>=1.17.0
|
||||
pypdf>=3.17.0
|
||||
python-docx>=0.8.11
|
||||
reportlab>=4.0.0
|
||||
pytesseract>=0.3.10
|
||||
|
||||
# Utilities
|
||||
numpy>=1.24.0
|
||||
requests>=2.31.0
|
||||
python-dotenv>=1.0.0
|
||||
webdavclient3>=0.9.8
|
||||
|
||||
# Optional: for enhanced functionality
|
||||
# unidecode>=1.3.7 # For filename normalization
|
||||
# python-magic>=0.4.27 # For file type detection
|
||||
|
||||
17
services/__init__.py
Normal file
17
services/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
"""
|
||||
Services package for CBCFacil
|
||||
"""
|
||||
from .webdav_service import WebDAVService, webdav_service
|
||||
from .vram_manager import VRAMManager, vram_manager
|
||||
from .telegram_service import TelegramService, telegram_service
|
||||
from .gpu_detector import GPUDetector, GPUType, gpu_detector
|
||||
from .ai import ai_service
|
||||
|
||||
__all__ = [
|
||||
'WebDAVService', 'webdav_service',
|
||||
'VRAMManager', 'vram_manager',
|
||||
'TelegramService', 'telegram_service',
|
||||
'GPUDetector', 'GPUType', 'gpu_detector',
|
||||
'ai_service'
|
||||
]
|
||||
|
||||
15
services/ai/__init__.py
Normal file
15
services/ai/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
AI Providers package for CBCFacil
|
||||
"""
|
||||
|
||||
from .base_provider import AIProvider
|
||||
from .claude_provider import ClaudeProvider
|
||||
from .gemini_provider import GeminiProvider
|
||||
from .provider_factory import AIProviderFactory
|
||||
|
||||
__all__ = [
|
||||
'AIProvider',
|
||||
'ClaudeProvider',
|
||||
'GeminiProvider',
|
||||
'AIProviderFactory'
|
||||
]
|
||||
35
services/ai/base_provider.py
Normal file
35
services/ai/base_provider.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""
|
||||
Base AI Provider interface (Strategy pattern)
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
|
||||
class AIProvider(ABC):
    """Abstract base class for AI providers (Strategy pattern).

    Concrete providers (Claude, Gemini, ...) implement the text operations
    plus availability reporting so callers can swap providers freely.
    """

    @abstractmethod
    def summarize(self, text: str, **kwargs) -> str:
        """Generate summary of text"""
        pass

    @abstractmethod
    def correct_text(self, text: str, **kwargs) -> str:
        """Correct grammar and spelling in text"""
        pass

    @abstractmethod
    def classify_content(self, text: str, **kwargs) -> Dict[str, Any]:
        """Classify content into categories.

        Implementations return a dict containing at least a ``category``
        key (see the concrete providers).
        """
        pass

    @abstractmethod
    def is_available(self) -> bool:
        """Check if provider is available and configured"""
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Provider name"""
        pass
|
||||
108
services/ai/claude_provider.py
Normal file
108
services/ai/claude_provider.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""
|
||||
Claude AI Provider implementation
|
||||
"""
|
||||
import logging
|
||||
import subprocess
|
||||
import shutil
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from ..config import settings
|
||||
from ..core import AIProcessingError
|
||||
from .base_provider import AIProvider
|
||||
|
||||
|
||||
class ClaudeProvider(AIProvider):
    """Claude AI provider that shells out to the ``claude`` CLI."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Explicit path from settings wins; otherwise look it up on PATH.
        self._cli_path = settings.CLAUDE_CLI_PATH or shutil.which("claude")
        self._token = settings.ZAI_AUTH_TOKEN
        self._base_url = settings.ZAI_BASE_URL

    @property
    def name(self) -> str:
        return "Claude"

    def is_available(self) -> bool:
        """Check if Claude CLI is available and a token is configured."""
        return bool(self._cli_path and self._token)

    def _get_env(self) -> Dict[str, str]:
        """Build the subprocess environment for the Claude CLI.

        Fix: merge with the inherited environment. The original passed only
        the three custom variables, so the child process had no PATH/HOME,
        which breaks typical CLI tools.
        """
        import os
        env = dict(os.environ)
        env.update({
            'ANTHROPIC_AUTH_TOKEN': self._token,
            'ANTHROPIC_BASE_URL': self._base_url,
            'PYTHONUNBUFFERED': '1'
        })
        return env

    def _run_cli(self, prompt: str, timeout: int = 300) -> str:
        """Run the Claude CLI, feeding *prompt* on stdin.

        Raises:
            AIProcessingError: when the CLI is unavailable, times out,
                fails to start, or exits non-zero.
        """
        if not self.is_available():
            raise AIProcessingError("Claude CLI not available or not configured")

        try:
            process = subprocess.run(
                [self._cli_path],
                input=prompt,
                env=self._get_env(),
                text=True,
                capture_output=True,
                timeout=timeout,
                shell=False  # argument list, never a shell string
            )
        except subprocess.TimeoutExpired:
            raise AIProcessingError(f"Claude CLI timed out after {timeout}s")
        except Exception as e:
            raise AIProcessingError(f"Claude CLI error: {e}")

        # Check the exit status outside the try block; in the original the
        # broad `except Exception` caught the AIProcessingError raised here
        # and re-wrapped it as a generic "Claude CLI error".
        if process.returncode != 0:
            error_msg = process.stderr or "Unknown error"
            raise AIProcessingError(f"Claude CLI failed: {error_msg}")

        return process.stdout.strip()

    def summarize(self, text: str, **kwargs) -> str:
        """Generate summary using Claude"""
        prompt = f"""Summarize the following text:

{text}

Provide a clear, concise summary in Spanish."""
        return self._run_cli(prompt)

    def correct_text(self, text: str, **kwargs) -> str:
        """Correct text using Claude"""
        prompt = f"""Correct the following text for grammar, spelling, and clarity:

{text}

Return only the corrected text, nothing else."""
        return self._run_cli(prompt)

    def classify_content(self, text: str, **kwargs) -> Dict[str, Any]:
        """Classify content using Claude.

        Returns a dict with ``category`` (one of the fixed list, defaulting
        to ``otras_clases`` on unrecognized output), ``confidence`` and
        ``provider``.
        """
        categories = ["historia", "analisis_contable", "instituciones_gobierno", "otras_clases"]

        prompt = f"""Classify the following text into one of these categories:
- historia
- analisis_contable
- instituciones_gobierno
- otras_clases

Text: {text}

Return only the category name, nothing else."""
        result = self._run_cli(prompt).lower()

        # Validate result; anything unexpected falls into the catch-all bin.
        if result not in categories:
            result = "otras_clases"

        return {
            "category": result,
            "confidence": 0.9,
            "provider": self.name
        }
|
||||
297
services/ai/gemini_provider.py
Normal file
297
services/ai/gemini_provider.py
Normal file
@@ -0,0 +1,297 @@
|
||||
"""
|
||||
Gemini AI Provider - Optimized version with rate limiting and retry
|
||||
"""
|
||||
import logging
|
||||
import subprocess
|
||||
import shutil
|
||||
import requests
|
||||
import time
|
||||
from typing import Dict, Any, Optional
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from ..config import settings
|
||||
from ..core import AIProcessingError
|
||||
from .base_provider import AIProvider
|
||||
|
||||
|
||||
class TokenBucket:
    """Token bucket rate limiter.

    Tokens refill continuously at ``rate`` per second up to ``capacity``;
    :meth:`acquire` either grants immediately or tells the caller how long
    to sleep.
    """

    def __init__(self, rate: float = 10, capacity: int = 20):
        import threading
        self.rate = rate  # tokens refilled per second
        self.capacity = capacity  # maximum burst size
        self.tokens = capacity  # current token balance (starts full)
        self.last_update = time.time()
        # Fix: create the lock eagerly. The original lazily created it on
        # first use, which is itself a race — two threads could each install
        # a different Lock and then not exclude each other.
        self._lock = threading.Lock()

    def acquire(self, tokens: int = 1) -> float:
        """Take *tokens* from the bucket.

        Returns:
            0.0 when the tokens were granted immediately, otherwise the
            number of seconds the caller should wait before proceeding
            (the bucket is drained to zero in that case).
        """
        with self._lock:
            now = time.time()
            elapsed = now - self.last_update
            self.last_update = now
            # Refill proportionally to elapsed time, capped at capacity.
            self.tokens = min(self.capacity, self.tokens + elapsed * self.rate)

            if self.tokens >= tokens:
                self.tokens -= tokens
                return 0.0

            wait_time = (tokens - self.tokens) / self.rate
            self.tokens = 0
            return wait_time
|
||||
|
||||
|
||||
class CircuitBreaker:
    """Circuit breaker for API calls.

    States: ``closed`` (normal), ``open`` (short-circuit every call until
    ``recovery_timeout`` elapses), ``half-open`` (one trial call allowed;
    success closes the breaker again).
    """

    def __init__(self, failure_threshold: int = 5, recovery_timeout: int = 60):
        import threading
        self.failure_threshold = failure_threshold  # failures before opening
        self.recovery_timeout = recovery_timeout  # seconds before a retry
        self.failures = 0  # consecutive failure count
        self.last_failure: Optional[datetime] = None
        self.state = "closed"  # closed, open, half-open
        # Fix: eager lock creation — the original lazy creation was racy.
        self._lock = threading.Lock()

    def call(self, func, *args, **kwargs):
        """Invoke *func*, tracking failures and short-circuiting when open.

        Fix: the lock now guards only the state transitions; *func* itself
        runs outside the lock. The original held the lock for the whole
        call, serializing every caller behind the slowest request.

        Raises:
            AIProcessingError: when the breaker is open.
            Exception: whatever *func* raises is re-raised after accounting.
        """
        with self._lock:
            if self.state == "open":
                cooled_down = (
                    self.last_failure is not None
                    and (datetime.utcnow() - self.last_failure).total_seconds() > self.recovery_timeout
                )
                if cooled_down:
                    self.state = "half-open"
                else:
                    raise AIProcessingError("Circuit breaker is open")

        try:
            result = func(*args, **kwargs)
        except Exception:
            with self._lock:
                self.failures += 1
                self.last_failure = datetime.utcnow()
                if self.failures >= self.failure_threshold:
                    self.state = "open"
            raise

        with self._lock:
            if self.state == "half-open":
                # Trial call succeeded: close the breaker and reset counters.
                self.state = "closed"
                self.failures = 0
        return result
|
||||
|
||||
|
||||
class GeminiProvider(AIProvider):
    """Gemini AI provider with rate limiting, retry and CLI→API fallback."""

    def __init__(self):
        super().__init__()
        # Fix: the original never created self.logger even though
        # _run_with_retry() and _run() log through it (AttributeError).
        self.logger = logging.getLogger(__name__)
        self._cli_path = settings.GEMINI_CLI_PATH or shutil.which("gemini")
        self._api_key = settings.GEMINI_API_KEY
        self._flash_model = settings.GEMINI_FLASH_MODEL
        self._pro_model = settings.GEMINI_PRO_MODEL
        self._session = None  # created lazily by _init_session()
        self._rate_limiter = TokenBucket(rate=15, capacity=30)
        self._circuit_breaker = CircuitBreaker(failure_threshold=5, recovery_timeout=60)
        self._retry_config = {
            "max_attempts": 3,
            "base_delay": 1.0,
            "max_delay": 30.0,
            "exponential_base": 2
        }

    @property
    def name(self) -> str:
        # Fix: `name` is abstract on AIProvider and referenced by
        # classify_content(); without it the class cannot be instantiated.
        return "Gemini"

    def is_available(self) -> bool:
        """Provider is usable when either the CLI or an API key is configured.

        Fix: `is_available` is abstract on AIProvider and is called by the
        provider factory; it was missing from this class.
        """
        return bool(self._cli_path or self._api_key)

    def _init_session(self) -> None:
        """Initialize the HTTP session with connection pooling (idempotent)."""
        if self._session is None:
            self._session = requests.Session()
            adapter = requests.adapters.HTTPAdapter(
                pool_connections=10,
                pool_maxsize=20,
                max_retries=0  # We handle retries manually
            )
            self._session.mount('https://', adapter)

    def _run_with_retry(self, func, *args, **kwargs):
        """Execute *func* through the circuit breaker with exponential backoff.

        Only network-level errors (requests exceptions) are retried; other
        exceptions propagate immediately.
        """
        max_attempts = self._retry_config["max_attempts"]
        base_delay = self._retry_config["base_delay"]

        last_exception = None

        for attempt in range(max_attempts):
            try:
                return self._circuit_breaker.call(func, *args, **kwargs)
            except requests.exceptions.RequestException as e:
                last_exception = e
                if attempt < max_attempts - 1:
                    delay = min(
                        base_delay * (2 ** attempt),
                        self._retry_config["max_delay"]
                    )
                    # Add up to 10% jitter so concurrent retries spread out.
                    delay += delay * 0.1 * (time.time() % 1)
                    self.logger.warning(f"Attempt {attempt + 1} failed: {e}, retrying in {delay:.2f}s")
                    time.sleep(delay)

        raise AIProcessingError(f"Max retries exceeded: {last_exception}")

    def _run_cli(self, prompt: str, use_flash: bool = True, timeout: int = 300) -> str:
        """Run the Gemini CLI with *prompt*.

        Raises:
            AIProcessingError: when the CLI is missing, times out, fails to
                start, or exits non-zero.
        """
        if not self._cli_path:
            raise AIProcessingError("Gemini CLI not available")

        model = self._flash_model if use_flash else self._pro_model
        cmd = [self._cli_path, model, prompt]

        try:
            # Apply rate limiting before spawning the subprocess.
            wait_time = self._rate_limiter.acquire()
            if wait_time > 0:
                time.sleep(wait_time)

            process = subprocess.run(
                cmd,
                text=True,
                capture_output=True,
                timeout=timeout,
                shell=False
            )
        except subprocess.TimeoutExpired:
            raise AIProcessingError(f"Gemini CLI timed out after {timeout}s")
        except Exception as e:
            raise AIProcessingError(f"Gemini CLI error: {e}")

        # Fix: exit-status check moved outside the try block; the broad
        # `except Exception` used to catch this AIProcessingError and
        # re-wrap it as a generic "Gemini CLI error".
        if process.returncode != 0:
            error_msg = process.stderr or "Unknown error"
            raise AIProcessingError(f"Gemini CLI failed: {error_msg}")

        return process.stdout.strip()

    def _call_api(self, prompt: str, use_flash: bool = True, timeout: int = 180) -> str:
        """Call the Gemini REST API with rate limiting and retry.

        Raises:
            AIProcessingError: when no API key is configured, retries are
                exhausted, or the response payload is malformed.
        """
        if not self._api_key:
            raise AIProcessingError("Gemini API key not configured")

        self._init_session()

        model = self._flash_model if use_flash else self._pro_model
        url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"

        payload = {
            "contents": [{
                "parts": [{"text": prompt}]
            }]
        }

        params = {"key": self._api_key}

        def api_call():
            # Apply rate limiting per attempt.
            wait_time = self._rate_limiter.acquire()
            if wait_time > 0:
                time.sleep(wait_time)

            response = self._session.post(
                url,
                json=payload,
                params=params,
                timeout=timeout
            )
            response.raise_for_status()
            return response

        response = self._run_with_retry(api_call)
        data = response.json()

        if "candidates" not in data or not data["candidates"]:
            raise AIProcessingError("Empty response from Gemini API")

        candidate = data["candidates"][0]
        if "content" not in candidate or "parts" not in candidate["content"]:
            raise AIProcessingError("Invalid response format from Gemini API")

        result = candidate["content"]["parts"][0]["text"]
        return result.strip()

    def _run(self, prompt: str, use_flash: bool = True, timeout: int = 300) -> str:
        """Run Gemini, preferring the CLI and falling back to the API."""
        if self._cli_path:
            try:
                return self._run_cli(prompt, use_flash, timeout)
            except Exception as e:
                self.logger.warning(f"Gemini CLI failed, trying API: {e}")

        if self._api_key:
            # The REST API gets a tighter timeout than the CLI.
            api_timeout = min(timeout, 180)
            return self._call_api(prompt, use_flash, api_timeout)

        raise AIProcessingError("No Gemini provider available (CLI or API)")

    def summarize(self, text: str, **kwargs) -> str:
        """Generate summary using Gemini"""
        prompt = f"""Summarize the following text:

{text}

Provide a clear, concise summary in Spanish."""
        return self._run(prompt, use_flash=True)

    def correct_text(self, text: str, **kwargs) -> str:
        """Correct text using Gemini"""
        prompt = f"""Correct the following text for grammar, spelling, and clarity:

{text}

Return only the corrected text, nothing else."""
        return self._run(prompt, use_flash=True)

    def classify_content(self, text: str, **kwargs) -> Dict[str, Any]:
        """Classify content using Gemini.

        Returns a dict with ``category`` (defaulting to ``otras_clases`` on
        unrecognized output), ``confidence`` and ``provider``.
        """
        categories = ["historia", "analisis_contable", "instituciones_gobierno", "otras_clases"]

        prompt = f"""Classify the following text into one of these categories:
- historia
- analisis_contable
- instituciones_gobierno
- otras_clases

Text: {text}

Return only the category name, nothing else."""
        result = self._run(prompt, use_flash=True).lower()

        # Validate result; anything unexpected falls into the catch-all bin.
        if result not in categories:
            result = "otras_clases"

        return {
            "category": result,
            "confidence": 0.9,
            "provider": self.name
        }

    def get_stats(self) -> Dict[str, Any]:
        """Get provider statistics (rate limiter, breaker, availability)."""
        return {
            "rate_limiter": {
                "tokens": round(self._rate_limiter.tokens, 2),
                "capacity": self._rate_limiter.capacity,
                "rate": self._rate_limiter.rate
            },
            "circuit_breaker": {
                "state": self._circuit_breaker.state,
                "failures": self._circuit_breaker.failures,
                "failure_threshold": self._circuit_breaker.failure_threshold
            },
            "cli_available": bool(self._cli_path),
            "api_available": bool(self._api_key)
        }
|
||||
|
||||
|
||||
# Global instance is created in __init__.py
|
||||
54
services/ai/provider_factory.py
Normal file
54
services/ai/provider_factory.py
Normal file
@@ -0,0 +1,54 @@
|
||||
"""
|
||||
AI Provider Factory (Factory Pattern)
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Type
|
||||
|
||||
from ..core import AIProcessingError
|
||||
from .base_provider import AIProvider
|
||||
from .claude_provider import ClaudeProvider
|
||||
from .gemini_provider import GeminiProvider
|
||||
|
||||
|
||||
class AIProviderFactory:
    """Factory for creating AI providers with fallback.

    Instantiates every known provider up front and hands out whichever is
    configured, preferring the caller's choice.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self._providers: Dict[str, AIProvider] = {
            'claude': ClaudeProvider(),
            'gemini': GeminiProvider()
        }

    def get_provider(self, preferred: str = 'gemini') -> AIProvider:
        """Return the preferred provider when usable, else any usable one.

        Raises:
            AIProcessingError: when no provider is configured.
        """
        candidate = self._providers.get(preferred)
        if candidate is not None and candidate.is_available():
            self.logger.info(f"Using {preferred} provider")
            return candidate

        # Preferred one is unusable — fall back to the first working provider.
        for name, provider in self._providers.items():
            if provider.is_available():
                self.logger.info(f"Falling back to {name} provider")
                return provider

        raise AIProcessingError("No AI providers available")

    def get_all_available(self) -> Dict[str, AIProvider]:
        """Return a name -> provider mapping of every usable provider."""
        available: Dict[str, AIProvider] = {}
        for name, provider in self._providers.items():
            if provider.is_available():
                available[name] = provider
        return available

    def get_best_provider(self) -> AIProvider:
        """Get the best available provider (Gemini > Claude)"""
        return self.get_provider('gemini')
|
||||
|
||||
|
||||
# Global instance
|
||||
ai_provider_factory = AIProviderFactory()
|
||||
256
services/ai_service.py
Normal file
256
services/ai_service.py
Normal file
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
AI Service - Unified interface for AI providers with caching
|
||||
"""
|
||||
import logging
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Optional, Dict, Any
|
||||
from threading import Lock
|
||||
|
||||
from ..config import settings
|
||||
from ..core import AIProcessingError
|
||||
from .ai.provider_factory import AIProviderFactory, ai_provider_factory
|
||||
|
||||
|
||||
class LRUCache:
    """Thread-safe LRU cache with a per-entry TTL.

    Entries live in an OrderedDict so recency bookkeeping is O(1); the
    original kept a separate list and paid O(n) ``list.remove`` on every
    get/set.
    """

    def __init__(self, max_size: int = 100, ttl: int = 3600):
        from collections import OrderedDict
        self.max_size = max_size  # maximum number of entries
        self.ttl = ttl  # time-to-live per entry, in seconds
        # Maps key -> (value, insertion timestamp); order == recency.
        self._cache = OrderedDict()
        self._lock = Lock()

    def _is_expired(self, timestamp: float) -> bool:
        """Return True when *timestamp* is older than the TTL."""
        return (time.time() - timestamp) > self.ttl

    def get(self, key: str) -> Optional[str]:
        """Return the cached value for *key*, or None if absent or expired."""
        with self._lock:
            entry = self._cache.get(key)
            if entry is None:
                return None
            value, timestamp = entry
            if self._is_expired(timestamp):
                # Expired entries are evicted lazily on access.
                del self._cache[key]
                return None
            self._cache.move_to_end(key)  # mark as most recently used
            return value

    def set(self, key: str, value: str) -> None:
        """Insert or refresh *key*, evicting the LRU entry when full."""
        with self._lock:
            if key in self._cache:
                self._cache.move_to_end(key)
            elif len(self._cache) >= self.max_size:
                self._cache.popitem(last=False)  # drop least recently used
            self._cache[key] = (value, time.time())

    def stats(self) -> Dict[str, int]:
        """Return cache statistics (keys kept for backward compatibility)."""
        with self._lock:
            return {
                "size": len(self._cache),
                "max_size": self.max_size,
                # NOTE(review): "hits" counts non-expired entries, not lookup
                # hits — semantics preserved from the original.
                "hits": sum(1 for _, t in self._cache.values() if not self._is_expired(t))
            }
|
||||
|
||||
|
||||
class RateLimiter:
    """Token bucket rate limiter.

    Tokens refill continuously at ``rate`` per second up to ``capacity``.
    """

    def __init__(self, rate: float = 10, capacity: int = 20):
        self.rate = rate  # tokens per second
        self.capacity = capacity
        self.tokens = capacity  # bucket starts full
        self.last_update = time.time()
        self._lock = Lock()

    def acquire(self, tokens: int = 1) -> float:
        """Reserve *tokens*; return seconds to wait (0.0 when granted now)."""
        with self._lock:
            current = time.time()
            refill = (current - self.last_update) * self.rate
            self.last_update = current
            self.tokens = min(self.capacity, self.tokens + refill)

            if self.tokens < tokens:
                # Not enough available — drain the bucket and report the
                # time needed to cover the shortfall.
                deficit = tokens - self.tokens
                self.tokens = 0
                return deficit / self.rate

            self.tokens -= tokens
            return 0.0
|
||||
|
||||
|
||||
class AIService:
    """Unified service for AI operations with caching and rate limiting.

    Wraps the provider factory with an LRU prompt cache and a token-bucket
    rate limiter so repeated prompts are served locally and API pressure
    stays bounded. Provider/API errors are converted into fallback return
    values rather than raised (existing caller contract).
    """

    # Cache sizing shared by __init__ and clear_cache so the two stay in sync.
    _CACHE_MAX_SIZE = 100
    _CACHE_TTL_SECONDS = 3600  # 1 hour

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self._factory: Optional["AIProviderFactory"] = None
        self._prompt_cache = LRUCache(max_size=self._CACHE_MAX_SIZE, ttl=self._CACHE_TTL_SECONDS)
        self._rate_limiter = RateLimiter(rate=15, capacity=30)
        self._stats = {
            "total_requests": 0,
            "cache_hits": 0,
            "api_calls": 0
        }

    @property
    def factory(self) -> "AIProviderFactory":
        """Lazily bind the global provider factory on first use."""
        if self._factory is None:
            self._factory = ai_provider_factory
        return self._factory

    def _get_cache_key(self, prompt: str, operation: str) -> str:
        """Generate a cache key from the operation and the FULL prompt.

        Fix: the original hashed only ``prompt[:500]``, so two different
        prompts sharing their first 500 characters collided on one cache
        entry and could return each other's results. Hashing is cheap, so
        the whole prompt is hashed instead.
        """
        content = f"{operation}:{prompt}"
        return hashlib.sha256(content.encode()).hexdigest()

    def _wait_for_slot(self) -> None:
        """Block until the rate limiter grants a token."""
        wait_time = self._rate_limiter.acquire()
        if wait_time > 0:
            time.sleep(wait_time)

    def generate_text(
        self,
        prompt: str,
        provider: Optional[str] = None,
        max_tokens: int = 4096
    ) -> str:
        """Generate text via an AI provider, serving repeats from cache.

        Returns the provider output, or an "Error: ..." string when the
        provider raises AIProcessingError (existing caller contract).
        """
        self._stats["total_requests"] += 1

        cache_key = self._get_cache_key(prompt, f"generate:{provider or 'default'}")

        cached_result = self._prompt_cache.get(cache_key)
        # NOTE(review): falsy (e.g. empty-string) cached results are treated
        # as misses by this truthiness check -- confirm intended.
        if cached_result:
            self._stats["cache_hits"] += 1
            self.logger.debug(f"Cache hit for generate_text ({len(cached_result)} chars)")
            return cached_result

        self._wait_for_slot()

        try:
            self._stats["api_calls"] += 1
            ai_provider = self.factory.get_provider(provider or 'gemini')
            result = ai_provider.generate(prompt, max_tokens=max_tokens)
            self._prompt_cache.set(cache_key, result)
            return result
        except AIProcessingError as e:
            self.logger.error(f"AI generation failed: {e}")
            return f"Error: {str(e)}"

    def summarize(self, text: str, **kwargs) -> str:
        """Summarize *text* via the best available provider, with caching.

        Returns an "Error: ..." string on provider failure (caller contract).
        """
        self._stats["total_requests"] += 1

        cache_key = self._get_cache_key(text, "summarize")

        cached_result = self._prompt_cache.get(cache_key)
        if cached_result:
            self._stats["cache_hits"] += 1
            self.logger.debug(f"Cache hit for summarize ({len(cached_result)} chars)")
            return cached_result

        self._wait_for_slot()

        try:
            self._stats["api_calls"] += 1
            provider = self.factory.get_best_provider()
            result = provider.summarize(text, **kwargs)
            self._prompt_cache.set(cache_key, result)
            return result
        except AIProcessingError as e:
            self.logger.error(f"Summarization failed: {e}")
            return f"Error: {str(e)}"

    def correct_text(self, text: str, **kwargs) -> str:
        """Correct grammar/spelling; falls back to the input text on failure."""
        self._stats["total_requests"] += 1

        cache_key = self._get_cache_key(text, "correct")

        cached_result = self._prompt_cache.get(cache_key)
        if cached_result:
            self._stats["cache_hits"] += 1
            return cached_result

        self._wait_for_slot()

        try:
            self._stats["api_calls"] += 1
            provider = self.factory.get_best_provider()
            result = provider.correct_text(text, **kwargs)
            self._prompt_cache.set(cache_key, result)
            return result
        except AIProcessingError as e:
            self.logger.error(f"Text correction failed: {e}")
            # Best-effort contract: hand back the original text unchanged.
            return text

    def classify_content(self, text: str, **kwargs) -> Dict[str, Any]:
        """Classify content into categories, caching the JSON-encoded result.

        The cache key deliberately uses only the first 200 characters, so
        texts sharing that prefix share one classification (kept from the
        original design). Returns a neutral fallback dict on failure.
        """
        import json  # local import kept; a file-level json import is not guaranteed

        self._stats["total_requests"] += 1

        cache_key = self._get_cache_key(text[:200], "classify")

        cached_result = self._prompt_cache.get(cache_key)
        if cached_result:
            self._stats["cache_hits"] += 1
            return json.loads(cached_result)

        self._wait_for_slot()

        try:
            self._stats["api_calls"] += 1
            provider = self.factory.get_best_provider()
            result = provider.classify_content(text, **kwargs)
            # Cache as JSON text because the LRU cache stores strings.
            self._prompt_cache.set(cache_key, json.dumps(result))
            return result
        except AIProcessingError as e:
            self.logger.error(f"Classification failed: {e}")
            return {"category": "otras_clases", "confidence": 0.0}

    def get_stats(self) -> Dict[str, Any]:
        """Get service statistics, including cache and rate-limiter state."""
        cache_stats = self._prompt_cache.stats()
        total = self._stats["total_requests"]
        hit_rate = (self._stats["cache_hits"] / total * 100) if total > 0 else 0
        return {
            **self._stats,
            "cache_size": cache_stats["size"],
            "cache_max_size": cache_stats["max_size"],
            "cache_hit_rate": round(hit_rate, 2),
            "rate_limiter": {
                "tokens": self._rate_limiter.tokens,
                "capacity": self._rate_limiter.capacity
            }
        }

    def clear_cache(self) -> None:
        """Drop all cached prompt results (same sizing as __init__)."""
        self._prompt_cache = LRUCache(max_size=self._CACHE_MAX_SIZE, ttl=self._CACHE_TTL_SECONDS)
        self.logger.info("AI service cache cleared")
|
||||
|
||||
|
||||
# Global instance
# Module-level singleton; import and share this object rather than building
# new AIService instances (each instance owns its own cache and rate limiter).
ai_service = AIService()
|
||||
247
services/gpu_detector.py
Normal file
247
services/gpu_detector.py
Normal file
@@ -0,0 +1,247 @@
|
||||
"""
|
||||
GPU Detection and Management Service
|
||||
|
||||
Provides unified interface for detecting and using NVIDIA (CUDA), AMD (ROCm), or CPU.
|
||||
Fallback order: NVIDIA -> AMD -> CPU
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import shutil
|
||||
from enum import Enum
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import torch
# Optional dependency: all GPU paths in this module are guarded by
# TORCH_AVAILABLE and degrade to CPU when PyTorch is missing, so the
# ImportError is swallowed deliberately.
try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
|
||||
|
||||
|
||||
class GPUType(Enum):
    """Supported GPU types"""
    NVIDIA = "nvidia"  # CUDA-capable NVIDIA hardware
    AMD = "amd"        # ROCm-capable AMD hardware
    CPU = "cpu"        # no usable GPU; pure-CPU fallback
|
||||
|
||||
|
||||
class GPUDetector:
    """
    Service for detecting and managing GPU resources.

    Detects the GPU type with fallback order NVIDIA -> AMD -> CPU and
    exposes a unified PyTorch device interface regardless of vendor.
    Detection is lazy: every public accessor calls initialize() on first use.
    """

    def __init__(self):
        self._gpu_type: Optional["GPUType"] = None
        self._device: Optional[str] = None
        self._initialized: bool = False

    def initialize(self) -> None:
        """Run detection once; subsequent calls are no-ops."""
        if self._initialized:
            return

        self._gpu_type = self._detect_gpu_type()
        self._device = self._get_device_string()
        self._setup_environment()
        self._initialized = True

        logger.info(f"GPU Detector initialized: {self._gpu_type.value} -> {self._device}")

    def _detect_gpu_type(self) -> "GPUType":
        """
        Detect the available GPU type.

        Honors the GPU_PREFERENCE env var ("auto", "nvidia", "amd", "cpu"),
        then probes nvidia-smi, rocm-smi, and finally torch.cuda (which is
        also true under ROCm builds) before falling back to CPU.
        """
        # Check user preference first
        preference = os.getenv("GPU_PREFERENCE", "auto").lower()
        if preference == "cpu":
            logger.info("GPU preference set to CPU, skipping GPU detection")
            return GPUType.CPU

        if not TORCH_AVAILABLE:
            logger.warning("PyTorch not available, using CPU")
            return GPUType.CPU

        # Check NVIDIA first
        if preference in ("auto", "nvidia"):
            if self._check_nvidia():
                logger.info("NVIDIA GPU detected via nvidia-smi")
                return GPUType.NVIDIA

        # Check AMD second
        if preference in ("auto", "amd"):
            if self._check_amd():
                logger.info("AMD GPU detected via ROCm")
                return GPUType.AMD

        # Fallback to checking torch.cuda (works for both NVIDIA and ROCm)
        if torch.cuda.is_available():
            device_name = torch.cuda.get_device_name(0).lower()
            # Vendor guessed from the marketing name; NVIDIA substrings are
            # checked first, so "rtx"/"gtx" never reach the AMD "rx" test.
            if "nvidia" in device_name or "geforce" in device_name or "rtx" in device_name or "gtx" in device_name:
                return GPUType.NVIDIA
            elif "amd" in device_name or "radeon" in device_name or "rx" in device_name:
                return GPUType.AMD
            else:
                # Unknown GPU vendor but CUDA works
                logger.warning(f"Unknown GPU vendor: {device_name}, treating as NVIDIA-compatible")
                return GPUType.NVIDIA

        logger.info("No GPU detected, using CPU")
        return GPUType.CPU

    def _check_nvidia(self) -> bool:
        """Check for an NVIDIA GPU by querying nvidia-smi.

        Fix: the original returned ``result.stdout.strip()`` (a str) on
        success despite the ``-> bool`` annotation; the value is now
        coerced with bool() so callers always get a true boolean.
        """
        nvidia_smi = shutil.which("nvidia-smi")
        if not nvidia_smi:
            return False

        try:
            result = subprocess.run(
                [nvidia_smi, "--query-gpu=name", "--format=csv,noheader"],
                capture_output=True,
                text=True,
                timeout=5
            )
            return result.returncode == 0 and bool(result.stdout.strip())
        except Exception as e:
            logger.debug(f"nvidia-smi check failed: {e}")
            return False

    def _check_amd(self) -> bool:
        """Check if an AMD GPU is available using rocm-smi."""
        rocm_smi = shutil.which("rocm-smi")
        if not rocm_smi:
            return False

        try:
            result = subprocess.run(
                [rocm_smi, "--showproductname"],
                capture_output=True,
                text=True,
                timeout=5
            )
            return result.returncode == 0 and "GPU" in result.stdout
        except Exception as e:
            logger.debug(f"rocm-smi check failed: {e}")
            return False

    def _setup_environment(self) -> None:
        """Set vendor-specific environment variables for the detected GPU."""
        if self._gpu_type == GPUType.AMD:
            # Set HSA override for AMD RX 6000 series (gfx1030); an existing
            # env value always wins (setdefault).
            hsa_version = os.getenv("HSA_OVERRIDE_GFX_VERSION", "10.3.0")
            os.environ.setdefault("HSA_OVERRIDE_GFX_VERSION", hsa_version)
            logger.info(f"Set HSA_OVERRIDE_GFX_VERSION={hsa_version}")

    def _get_device_string(self) -> str:
        """Return the PyTorch device string ("cuda" also covers ROCm builds)."""
        if self._gpu_type in (GPUType.NVIDIA, GPUType.AMD):
            return "cuda"
        return "cpu"

    @property
    def gpu_type(self) -> "GPUType":
        """Detected GPU type (triggers lazy detection)."""
        if not self._initialized:
            self.initialize()
        return self._gpu_type

    @property
    def device(self) -> str:
        """Device string for PyTorch (triggers lazy detection)."""
        if not self._initialized:
            self.initialize()
        return self._device

    def get_device(self) -> "torch.device":
        """Return a torch.device object; raises if PyTorch is missing."""
        if not TORCH_AVAILABLE:
            raise RuntimeError("PyTorch not available")
        if not self._initialized:
            self.initialize()
        return torch.device(self._device)

    def is_available(self) -> bool:
        """True when any GPU (NVIDIA or AMD) was detected."""
        if not self._initialized:
            self.initialize()
        return self._gpu_type in (GPUType.NVIDIA, GPUType.AMD)

    def is_nvidia(self) -> bool:
        """True when an NVIDIA GPU is being used."""
        if not self._initialized:
            self.initialize()
        return self._gpu_type == GPUType.NVIDIA

    def is_amd(self) -> bool:
        """True when an AMD GPU is being used."""
        if not self._initialized:
            self.initialize()
        return self._gpu_type == GPUType.AMD

    def is_cpu(self) -> bool:
        """True when running on CPU only."""
        if not self._initialized:
            self.initialize()
        return self._gpu_type == GPUType.CPU

    def get_device_name(self) -> str:
        """Human-readable device name ("CPU", the CUDA name, or "Unknown")."""
        if not self._initialized:
            self.initialize()

        if self._gpu_type == GPUType.CPU:
            return "CPU"

        if TORCH_AVAILABLE and torch.cuda.is_available():
            return torch.cuda.get_device_name(0)

        return "Unknown"

    def get_memory_info(self) -> Dict[str, Any]:
        """GPU memory totals in GiB, or a dict with an "error" key."""
        if not self._initialized:
            self.initialize()

        if self._gpu_type == GPUType.CPU:
            return {"type": "cpu", "error": "No GPU available"}

        if not TORCH_AVAILABLE or not torch.cuda.is_available():
            return {"type": self._gpu_type.value, "error": "CUDA not available"}

        try:
            props = torch.cuda.get_device_properties(0)
            total = props.total_memory / 1024**3
            allocated = torch.cuda.memory_allocated(0) / 1024**3
            reserved = torch.cuda.memory_reserved(0) / 1024**3

            return {
                "type": self._gpu_type.value,
                "device_name": props.name,
                "total_gb": round(total, 2),
                "allocated_gb": round(allocated, 2),
                "reserved_gb": round(reserved, 2),
                "free_gb": round(total - allocated, 2),
                "usage_percent": round((allocated / total) * 100, 1)
            }
        except Exception as e:
            return {"type": self._gpu_type.value, "error": str(e)}

    def empty_cache(self) -> None:
        """Release cached (reserved but unallocated) GPU memory."""
        if not self._initialized:
            self.initialize()

        if TORCH_AVAILABLE and torch.cuda.is_available():
            torch.cuda.empty_cache()
            logger.debug("GPU cache cleared")
|
||||
|
||||
|
||||
# Global singleton instance
# Import and share this detector; detection runs lazily on first accessor use.
gpu_detector = GPUDetector()
|
||||
137
services/metrics_collector.py
Normal file
137
services/metrics_collector.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
Performance metrics collector for CBCFacil
|
||||
"""
|
||||
import time
|
||||
import threading
|
||||
import psutil
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from contextlib import contextmanager
|
||||
|
||||
|
||||
class MetricsCollector:
    """Collect and aggregate performance metrics (thread-safe).

    Keeps request/error counters plus a bounded window of recent latencies
    for percentile reporting, and exposes process-level resource metrics
    via psutil.
    """

    _MAX_LATENCY_SAMPLES = 1000  # bound memory used by the percentile window

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self._start_time = time.time()
        self._request_count = 0
        self._error_count = 0
        self._total_latency = 0.0
        self._latencies = []
        self._lock = threading.Lock()
        self._process = psutil.Process()

    def record_request(self, latency: float, success: bool = True) -> None:
        """Record one request's latency (seconds) and success flag."""
        with self._lock:
            self._request_count += 1
            self._total_latency += latency
            self._latencies.append(latency)

            # Keep only the most recent samples for memory efficiency.
            if len(self._latencies) > self._MAX_LATENCY_SAMPLES:
                self._latencies = self._latencies[-self._MAX_LATENCY_SAMPLES:]

            if not success:
                self._error_count += 1

    @staticmethod
    def _percentiles(samples) -> Dict[str, float]:
        """Compute p50/p95/p99 from a sample list; no locking, pure helper."""
        if not samples:
            return {"p50": 0, "p95": 0, "p99": 0}
        ordered = sorted(samples)
        n = len(ordered)
        return {
            "p50": ordered[int(n * 0.50)],
            "p95": ordered[int(n * 0.95)],
            "p99": ordered[int(n * 0.99)]
        }

    def get_latency_percentiles(self) -> Dict[str, float]:
        """Calculate latency percentiles over the recent sample window."""
        with self._lock:
            snapshot = list(self._latencies)
        return self._percentiles(snapshot)

    def get_system_metrics(self) -> Dict[str, Any]:
        """Get process-level resource metrics; returns {} on psutil errors."""
        try:
            memory = self._process.memory_info()
            cpu_percent = self._process.cpu_percent(interval=0.1)

            return {
                "cpu_percent": cpu_percent,
                "memory_rss_mb": memory.rss / 1024 / 1024,
                "memory_vms_mb": memory.vms / 1024 / 1024,
                "thread_count": self._process.num_threads(),
                # Fix: report the COUNT -- the original returned the raw
                # popenfile objects, which are not JSON-serializable.
                "open_files": len(self._process.open_files()),
            }
        except Exception as e:
            self.logger.warning(f"Error getting system metrics: {e}")
            return {}

    def get_summary(self) -> Dict[str, Any]:
        """Get a metrics summary (counters, rates, latency percentiles).

        Fix: the original acquired the non-reentrant lock and then called
        get_latency_percentiles(), which re-acquired the same lock -- a
        guaranteed deadlock on every call. State is now snapshotted under
        the lock once and percentiles are computed outside it.
        """
        with self._lock:
            uptime = time.time() - self._start_time
            request_count = self._request_count
            error_count = self._error_count
            total_latency = self._total_latency
            snapshot = list(self._latencies)

        latency_pcts = self._percentiles(snapshot)

        return {
            "uptime_seconds": round(uptime, 2),
            "total_requests": request_count,
            "error_count": error_count,
            "error_rate": round(error_count / max(1, request_count) * 100, 2),
            "requests_per_second": round(request_count / max(1, uptime), 2),
            "average_latency_ms": round(total_latency / max(1, request_count) * 1000, 2),
            "latency_p50_ms": round(latency_pcts["p50"] * 1000, 2),
            "latency_p95_ms": round(latency_pcts["p95"] * 1000, 2),
            "latency_p99_ms": round(latency_pcts["p99"] * 1000, 2),
        }

    def reset(self) -> None:
        """Reset all counters and the latency window."""
        with self._lock:
            self._request_count = 0
            self._error_count = 0
            self._total_latency = 0.0
            self._latencies = []
            self._start_time = time.time()
|
||||
|
||||
|
||||
class LatencyTracker:
    """Context manager that reports an operation's wall-clock latency.

    On exit the elapsed time is recorded on the collector; success is
    derived from whether an exception escaped the managed block.
    """

    def __init__(self, collector: MetricsCollector, operation: str):
        self.collector = collector
        self.operation = operation  # label; currently informational only
        self.start_time: Optional[float] = None
        self.success = True

    def __enter__(self):
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        elapsed = time.time() - self.start_time
        self.collector.record_request(elapsed, exc_type is None)
        return False  # never suppress exceptions
|
||||
|
||||
|
||||
# Global metrics collector
# Module-level singleton shared by track_latency() and get_performance_report().
metrics_collector = MetricsCollector()
|
||||
|
||||
|
||||
@contextmanager
def track_latency(operation: str = "unknown"):
    """Record the wrapped block's latency on the global metrics collector."""
    tracker = LatencyTracker(metrics_collector, operation)
    with tracker:
        yield
|
||||
|
||||
|
||||
def get_performance_report() -> Dict[str, Any]:
    """Generate a comprehensive performance report.

    Combines aggregated request metrics, process resource metrics, and a
    UTC timestamp from the global metrics collector.
    """
    report = {
        "metrics": metrics_collector.get_summary(),
        "system": metrics_collector.get_system_metrics(),
    }
    report["timestamp"] = datetime.utcnow().isoformat()
    return report
|
||||
91
services/telegram_service.py
Normal file
91
services/telegram_service.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""
|
||||
Telegram notification service
|
||||
"""
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
from ..config import settings
|
||||
|
||||
# Optional dependency: notification sending degrades to logged no-ops when
# the requests library is missing (see _send_request).
try:
    import requests
    REQUESTS_AVAILABLE = True
except ImportError:
    REQUESTS_AVAILABLE = False
|
||||
|
||||
|
||||
class TelegramService:
    """Service for sending Telegram notifications via the Bot API.

    Unconfigured or failed sends return False instead of raising, so
    notification problems never break the calling pipeline.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self._token: Optional[str] = None
        self._chat_id: Optional[str] = None
        # Per-error-key throttle state: error_key -> (last_message, last_time)
        self._last_error_cache: dict = {}

    def configure(self, token: str, chat_id: str) -> None:
        """Configure Telegram credentials."""
        self._token = token
        self._chat_id = chat_id
        self.logger.info("Telegram service configured")

    @property
    def is_configured(self) -> bool:
        """True when both token and chat id are set."""
        return bool(self._token and self._chat_id)

    def _send_request(self, endpoint: str, data: dict, retries: int = 3, delay: int = 2) -> bool:
        """POST to the Telegram Bot API, retrying up to ``retries`` times.

        Fix: the original slept after EVERY failed attempt, including the
        last one, adding ``delay`` seconds of pointless latency to the
        failure path; the final attempt now returns immediately.
        """
        if not REQUESTS_AVAILABLE:
            self.logger.warning("requests library not available")
            return False

        url = f"https://api.telegram.org/bot{self._token}/{endpoint}"

        for attempt in range(retries):
            try:
                resp = requests.post(url, data=data, timeout=10)
                if resp.status_code == 200:
                    return True
                self.logger.error(f"Telegram API error: {resp.status_code}")
            except Exception as e:
                self.logger.error(f"Telegram request failed (attempt {attempt+1}/{retries}): {e}")
            if attempt < retries - 1:  # no sleep after the final attempt
                time.sleep(delay)
        return False

    def send_message(self, message: str) -> bool:
        """Send a text message; False when unconfigured or on API failure."""
        if not self.is_configured:
            self.logger.warning("Telegram not configured, skipping notification")
            return False
        data = {"chat_id": self._chat_id, "text": message}
        return self._send_request("sendMessage", data)

    def send_start_notification(self) -> bool:
        """Send the service-start notification."""
        message = "CBCFacil Service Started - AI document processing active"
        return self.send_message(message)

    def send_error_notification(self, error_key: str, error_message: str) -> bool:
        """Send an error notification, throttled per ``error_key``.

        A repeat of the identical message within
        settings.ERROR_THROTTLE_SECONDS is suppressed (returns False);
        a new message or an expired window is recorded and sent.
        """
        now = datetime.utcnow()
        prev = self._last_error_cache.get(error_key)
        if prev is not None:
            prev_msg, prev_time = prev
            same_message = error_message == prev_msg
            within_window = (now - prev_time).total_seconds() <= settings.ERROR_THROTTLE_SECONDS
            if same_message and within_window:
                return False
        self._last_error_cache[error_key] = (error_message, now)
        return self.send_message(f"Error: {error_message}")
|
||||
|
||||
|
||||
# Global instance
# Module-level singleton used by send_telegram_message() and importers.
telegram_service = TelegramService()
|
||||
|
||||
|
||||
def send_telegram_message(message: str, retries: int = 3, delay: int = 2) -> bool:
    """Legacy function for backward compatibility.

    NOTE(review): ``retries`` and ``delay`` are accepted but ignored here --
    the underlying service applies its own retry defaults. Kept so old
    call sites keep working.
    """
    return telegram_service.send_message(message)
|
||||
172
services/vram_manager.py
Normal file
172
services/vram_manager.py
Normal file
@@ -0,0 +1,172 @@
|
||||
"""
|
||||
VRAM/GPU memory management service
|
||||
"""
|
||||
import gc
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, Dict, Any
|
||||
from ..core import BaseService
|
||||
from ..config import settings
|
||||
|
||||
# Optional dependency: every VRAM operation below is guarded by
# TORCH_AVAILABLE and becomes a no-op when PyTorch is missing.
try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
|
||||
|
||||
# Import gpu_detector after torch check
|
||||
from .gpu_detector import gpu_detector, GPUType
|
||||
|
||||
|
||||
class VRAMManager(BaseService):
    """Service for managing GPU VRAM usage.

    Tracks lazily-loaded model handles (Whisper, OCR, TrOCR) and frees
    their GPU memory either on demand or when usage crosses a threshold.
    """

    def __init__(self):
        super().__init__("VRAMManager")
        # Model handles; None until the corresponding pipeline loads them.
        self._whisper_model = None
        self._ocr_models = None
        self._trocr_models = None
        self._models_last_used: Optional[datetime] = None
        self._cleanup_threshold = 0.7  # fraction of total VRAM that triggers cleanup
        self._cleanup_interval = 300   # minimum seconds between lazy cleanups
        self._last_cleanup: Optional[datetime] = None

    def initialize(self) -> None:
        """Initialize the VRAM manager and apply vendor-specific settings."""
        # Initialize GPU detector first
        gpu_detector.initialize()

        if not TORCH_AVAILABLE:
            self.logger.warning("PyTorch not available - VRAM management disabled")
            return

        if gpu_detector.is_available():
            gpu_type = gpu_detector.gpu_type
            device_name = gpu_detector.get_device_name()

            if gpu_type == GPUType.AMD:
                self.logger.info(f"VRAM Manager initialized with AMD ROCm: {device_name}")
            elif gpu_type == GPUType.NVIDIA:
                os.environ['CUDA_VISIBLE_DEVICES'] = settings.CUDA_VISIBLE_DEVICES
                if settings.PYTORCH_CUDA_ALLOC_CONF:
                    # NOTE(review): parses "key:value" and pokes
                    # torch.backends.cuda.max_split_size_mb; the usual
                    # mechanism is the PYTORCH_CUDA_ALLOC_CONF env var --
                    # confirm this attribute exists for the pinned torch.
                    torch.backends.cuda.max_split_size_mb = int(settings.PYTORCH_CUDA_ALLOC_CONF.split(':')[1])
                self.logger.info(f"VRAM Manager initialized with NVIDIA CUDA: {device_name}")
        else:
            self.logger.warning("No GPU available - GPU acceleration disabled")

    def cleanup(self) -> None:
        """Release all tracked GPU models and aggressively free VRAM."""
        if not TORCH_AVAILABLE or not torch.cuda.is_available():
            return

        models_freed = []

        if self._whisper_model is not None:
            try:
                del self._whisper_model
                self._whisper_model = None
                models_freed.append("Whisper")
            except Exception as e:
                self.logger.error(f"Error freeing Whisper VRAM: {e}")

        if self._ocr_models is not None:
            try:
                self._ocr_models = None
                models_freed.append("OCR")
            except Exception as e:
                self.logger.error(f"Error freeing OCR VRAM: {e}")

        if self._trocr_models is not None:
            try:
                if isinstance(self._trocr_models, dict):
                    model = self._trocr_models.get('model')
                    if model is not None:
                        model.to('cpu')  # move weights off the GPU first
                        models_freed.append("TrOCR")
                torch.cuda.empty_cache()
            except Exception as e:
                self.logger.error(f"Error freeing TrOCR VRAM: {e}")

        # Drop every handle regardless of individual failures above.
        self._whisper_model = None
        self._ocr_models = None
        self._trocr_models = None
        self._models_last_used = None

        if models_freed:
            self.logger.info(f"Freed VRAM for models: {', '.join(models_freed)}")

        self._force_aggressive_cleanup()

    def update_usage(self) -> None:
        """Record that the GPU models were just used."""
        self._models_last_used = datetime.utcnow()
        self.logger.debug("VRAM usage timestamp updated")

    def should_cleanup(self) -> bool:
        """Return True when VRAM pressure and the cleanup interval allow it."""
        if not TORCH_AVAILABLE or not torch.cuda.is_available():
            return False
        if self._last_cleanup is None:
            return True
        if (datetime.utcnow() - self._last_cleanup).total_seconds() < self._cleanup_interval:
            return False
        allocated = torch.cuda.memory_allocated(0)
        total = torch.cuda.get_device_properties(0).total_memory
        return allocated / total > self._cleanup_threshold

    def lazy_cleanup(self) -> None:
        """Run cleanup only when should_cleanup() says so."""
        if self.should_cleanup():
            self.cleanup()
            self._last_cleanup = datetime.utcnow()

    def _force_aggressive_cleanup(self) -> None:
        """Force aggressive VRAM cleanup.

        Fix: the original called gc.collect(0), collecting only generation 0
        and leaving older unreachable tensors alive; a full gc.collect() is
        needed before empty_cache() can actually return their memory.
        """
        if not TORCH_AVAILABLE or not torch.cuda.is_available():
            return
        try:
            before_allocated = torch.cuda.memory_allocated(0) / 1024**3
            before_reserved = torch.cuda.memory_reserved(0) / 1024**3
            self.logger.debug(f"Before cleanup - Allocated: {before_allocated:.2f}GB, Reserved: {before_reserved:.2f}GB")
            gc.collect()  # full collection across all generations
            torch.cuda.empty_cache()
            after_allocated = torch.cuda.memory_allocated(0) / 1024**3
            after_reserved = torch.cuda.memory_reserved(0) / 1024**3
            self.logger.debug(f"After cleanup - Allocated: {after_allocated:.2f}GB, Reserved: {after_reserved:.2f}GB")
            if after_reserved < before_reserved:
                self.logger.info(f"VRAM freed: {(before_reserved - after_reserved):.2f}GB")
        except Exception as e:
            self.logger.error(f"Error in aggressive VRAM cleanup: {e}")

    def get_usage(self) -> Dict[str, Any]:
        """Report VRAM totals plus which models are currently resident."""
        if not TORCH_AVAILABLE:
            return {'error': 'PyTorch not available'}
        if not torch.cuda.is_available():
            return {'error': 'CUDA not available'}
        total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        cached = torch.cuda.memory_reserved(0) / 1024**3
        free = total - allocated
        return {
            'total_gb': round(total, 2),
            'allocated_gb': round(allocated, 2),
            'cached_gb': round(cached, 2),
            'free_gb': round(free, 2),
            'whisper_loaded': self._whisper_model is not None,
            'ocr_models_loaded': self._ocr_models is not None,
            'trocr_models_loaded': self._trocr_models is not None,
            'last_used': self._models_last_used.isoformat() if self._models_last_used else None,
            'timeout_seconds': settings.MODEL_TIMEOUT_SECONDS
        }

    def force_free(self) -> str:
        """Force immediate VRAM free"""
        self.cleanup()
        return "VRAM freed successfully"
|
||||
|
||||
|
||||
# Global instance
# Module-level singleton shared by the processing pipelines.
vram_manager = VRAMManager()
|
||||
200
services/webdav_service.py
Normal file
200
services/webdav_service.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""
|
||||
WebDAV service for Nextcloud integration
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import unicodedata
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict
|
||||
from contextlib import contextmanager
|
||||
import requests
|
||||
from requests.auth import HTTPBasicAuth
|
||||
from requests.adapters import HTTPAdapter
|
||||
|
||||
from ..config import settings
|
||||
from ..core import WebDAVError
|
||||
|
||||
|
||||
class WebDAVService:
|
||||
"""Service for WebDAV operations with Nextcloud"""
|
||||
|
||||
    def __init__(self):
        """Create an unconnected service; call initialize() before any request."""
        self.session: Optional[requests.Session] = None
        self.logger = logging.getLogger(__name__)
        self._retry_delay = 1  # base backoff in seconds, doubled per attempt
        self._max_retries = settings.WEBDAV_MAX_RETRIES
|
||||
|
||||
    def initialize(self) -> None:
        """Initialize the WebDAV session.

        Builds an authenticated requests.Session, mounts pooled HTTP adapters
        (adapter-level retries disabled because _request retries manually),
        and probes the endpoint root with a GET.

        Raises:
            WebDAVError: when credentials are missing or the probe fails.
        """
        if not settings.has_webdav_config:
            raise WebDAVError("WebDAV credentials not configured")

        self.session = requests.Session()
        self.session.auth = HTTPBasicAuth(settings.NEXTCLOUD_USER, settings.NEXTCLOUD_PASSWORD)

        # Configure HTTP adapter with retry strategy
        adapter = HTTPAdapter(
            max_retries=0,  # We'll handle retries manually
            pool_connections=10,
            pool_maxsize=20
        )
        self.session.mount('https://', adapter)
        self.session.mount('http://', adapter)

        # Test connection
        try:
            self._request('GET', '', timeout=5)
            self.logger.info("WebDAV connection established")
        except Exception as e:
            raise WebDAVError(f"Failed to connect to WebDAV: {e}")
|
||||
|
||||
def cleanup(self) -> None:
|
||||
"""Cleanup WebDAV session"""
|
||||
if self.session:
|
||||
self.session.close()
|
||||
self.session = None
|
||||
|
||||
@staticmethod
|
||||
def normalize_path(path: str) -> str:
|
||||
"""Normalize remote paths to a consistent representation"""
|
||||
if not path:
|
||||
return ""
|
||||
normalized = unicodedata.normalize("NFC", str(path)).strip()
|
||||
if not normalized:
|
||||
return ""
|
||||
normalized = normalized.replace("\\", "/")
|
||||
normalized = re.sub(r"/+", "/", normalized)
|
||||
return normalized.lstrip("/")
|
||||
|
||||
def _build_url(self, remote_path: str) -> str:
|
||||
"""Build WebDAV URL"""
|
||||
path = self.normalize_path(remote_path)
|
||||
base_url = settings.WEBDAV_ENDPOINT.rstrip('/')
|
||||
return f"{base_url}/{path}"
|
||||
|
||||
    def _request(self, method: str, remote_path: str, **kwargs) -> requests.Response:
        """Make an HTTP request to WebDAV with exponential-backoff retries.

        Only transport failures (requests.RequestException, incl. Timeout)
        are retried; HTTP error statuses raise WebDAVError immediately, with
        a dedicated message for 404.

        Raises:
            WebDAVError: uninitialized session, HTTP error status, or
                transport failure after all retries.
        """
        if not self.session:
            raise WebDAVError("WebDAV session not initialized")

        url = self._build_url(remote_path)
        timeout = kwargs.pop('timeout', settings.HTTP_TIMEOUT)

        for attempt in range(self._max_retries):
            try:
                response = self.session.request(method, url, timeout=timeout, **kwargs)
                if response.status_code < 400:
                    return response
                elif response.status_code == 404:
                    raise WebDAVError(f"Resource not found: {remote_path}")
                else:
                    raise WebDAVError(f"HTTP {response.status_code}: {response.text}")
            except (requests.RequestException, requests.Timeout) as e:
                # NOTE(review): the WebDAVError raised above is not caught
                # here, so HTTP-level failures are NOT retried -- presumably
                # intentional; confirm. Timeout is already a subclass of
                # RequestException, so listing it is redundant but harmless.
                if attempt == self._max_retries - 1:
                    raise WebDAVError(f"Request failed after {self._max_retries} retries: {e}")
                delay = self._retry_delay * (2 ** attempt)  # exponential backoff
                self.logger.warning(f"Request failed (attempt {attempt + 1}/{self._max_retries}), retrying in {delay}s...")
                time.sleep(delay)

        raise WebDAVError("Max retries exceeded")
|
||||
|
||||
def list(self, remote_path: str = "") -> List[str]:
|
||||
"""List files in remote directory"""
|
||||
self.logger.debug(f"Listing remote directory: {remote_path}")
|
||||
response = self._request('PROPFIND', remote_path, headers={'Depth': '1'})
|
||||
return self._parse_propfind_response(response.text)
|
||||
|
||||
    def _parse_propfind_response(self, xml_response: str) -> List[str]:
        """Parse a PROPFIND XML response into a list of entry paths.

        Extracts every DAV: href, strips the configured base URL prefix and
        any leading slash. Parse errors are logged and yield a partial (or
        empty) list rather than raising.

        NOTE(review): Nextcloud hrefs are typically server-relative paths
        (e.g. /remote.php/webdav/...), while settings.NEXTCLOUD_URL is an
        absolute URL -- the startswith() strip may never match; verify
        against a live response. Depth-1 listings also include the queried
        directory itself.
        """
        # Simple parser for PROPFIND response
        files = []
        try:
            import xml.etree.ElementTree as ET
            root = ET.fromstring(xml_response)

            # Find all href elements
            for href in root.findall('.//{DAV:}href'):
                href_text = href.text or ""
                # Remove base URL from href
                base_url = settings.NEXTCLOUD_URL.rstrip('/')
                if href_text.startswith(base_url):
                    href_text = href_text[len(base_url):]
                files.append(href_text.lstrip('/'))
        except Exception as e:
            self.logger.error(f"Error parsing PROPFIND response: {e}")

        return files
|
||||
|
||||
def download(self, remote_path: str, local_path: Path) -> None:
    """Stream a remote file to *local_path*, creating parent directories."""
    self.logger.info(f"Downloading {remote_path} to {local_path}")

    local_path.parent.mkdir(parents=True, exist_ok=True)
    response = self._request('GET', remote_path, stream=True)

    # A 64 KiB write buffer keeps throughput high on large files.
    with open(local_path, 'wb', buffering=65536) as out:
        for chunk in response.iter_content(chunk_size=settings.DOWNLOAD_CHUNK_SIZE):
            if chunk:
                out.write(chunk)

    self.logger.debug(f"Download completed: {local_path}")
|
||||
|
||||
def upload(self, local_path: Path, remote_path: str) -> None:
    """Upload a local file to WebDAV, creating remote parent dirs first."""
    self.logger.info(f"Uploading {local_path} to {remote_path}")

    # Ensure the remote parent collection exists before the PUT.
    normalized = self.normalize_path(remote_path)
    parent, sep, _ = normalized.rpartition('/')
    if sep:
        self.makedirs(parent)

    with open(local_path, 'rb') as src:
        self._request('PUT', remote_path, data=src)

    self.logger.debug(f"Upload completed: {remote_path}")
|
||||
|
||||
def mkdir(self, remote_path: str) -> None:
    """Create a directory on WebDAV (thin alias for :meth:`makedirs`)."""
    self.makedirs(remote_path)
|
||||
|
||||
def makedirs(self, remote_path: str) -> None:
    """Create a directory and all missing parents on WebDAV.

    Issues one MKCOL per path component.  Responses indicating the
    collection already exists are tolerated:
      * 405 Method Not Allowed — RFC 4918 §9.3.1 mandates this for
        MKCOL on an already-existing resource (Nextcloud's answer)
      * 409 Conflict — returned by some servers in the same situation
    Any other WebDAVError propagates to the caller.
    """
    path = self.normalize_path(remote_path)
    if not path:
        return

    current = ""
    for part in path.split('/'):
        current = f"{current}/{part}" if current else part
        try:
            self._request('MKCOL', current)
            self.logger.debug(f"Created directory: {current}")
        except WebDAVError as e:
            # Directory might already exist (405/409 are OK).
            message = str(e)
            if '405' not in message and '409' not in message:
                raise
|
||||
|
||||
def delete(self, remote_path: str) -> None:
    """Remove a remote file or directory."""
    self.logger.info(f"Deleting remote path: {remote_path}")
    self._request('DELETE', remote_path)
|
||||
|
||||
def exists(self, remote_path: str) -> bool:
    """Return True when a HEAD request on the remote path succeeds."""
    try:
        self._request('HEAD', remote_path)
    except WebDAVError:
        return False
    return True
|
||||
|
||||
|
||||
# Global instance shared by every importer of this module.
webdav_service = WebDAVService()
|
||||
165
setup.py
Normal file
165
setup.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Setup script for CBCFacil
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import platform
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def check_python_version():
    """Abort (exit code 1) unless the interpreter is Python 3.10+."""
    major, minor, micro = sys.version_info[:3]
    if (major, minor) < (3, 10):
        print("❌ Error: Python 3.10 or higher is required")
        print(f" Current version: {sys.version}")
        sys.exit(1)
    print(f"✓ Python {major}.{minor}.{micro}")
|
||||
|
||||
|
||||
def check_system_dependencies():
    """Report availability of OS-level tools (CUDA, Tesseract, FFmpeg).

    Purely informational: prints a check/warning line per dependency and
    never raises for a missing tool.
    """

    def tool_available(cmd):
        # True when the command runs and exits 0; a missing binary or a
        # non-zero exit both count as unavailable.
        try:
            subprocess.run(cmd, check=True, capture_output=True)
            return True
        except (subprocess.CalledProcessError, FileNotFoundError):
            return False

    system = platform.system().lower()
    print("\n📦 Checking system dependencies...")

    if system == "linux":
        # CUDA is optional; its absence only disables GPU acceleration.
        if os.path.exists("/usr/local/cuda"):
            print("✓ CUDA found")
        else:
            print("⚠ CUDA not found - GPU acceleration will be disabled")

        if tool_available(["tesseract", "--version"]):
            print("✓ Tesseract OCR installed")
        else:
            print("⚠ Tesseract OCR not found - PDF processing may not work")

        if tool_available(["ffmpeg", "-version"]):
            print("✓ FFmpeg installed")
        else:
            print("⚠ FFmpeg not found - audio processing may not work")

    elif system == "darwin":  # macOS
        if tool_available(["brew", "list", "tesseract"]):
            print("✓ Tesseract OCR installed")
        else:
            print("⚠ Tesseract not found. Install with: brew install tesseract")

    print()
|
||||
|
||||
|
||||
def create_virtual_environment():
    """Create ./venv if missing and return its Path."""
    venv_dir = Path("venv")
    if venv_dir.exists():
        print("✓ Virtual environment already exists")
    else:
        print("📦 Creating virtual environment...")
        subprocess.run([sys.executable, "-m", "venv", "venv"], check=True)
        print("✓ Virtual environment created")
    return venv_dir
|
||||
|
||||
|
||||
def install_requirements(venv_path):
    """Upgrade pip and install requirements.txt inside the given venv."""
    bin_dir = "Scripts" if platform.system() == "Windows" else "bin"
    pip = str(venv_path / bin_dir / "pip")

    print("📦 Installing Python requirements...")
    for args in (["install", "--upgrade", "pip"],
                 ["install", "-r", "requirements.txt"]):
        subprocess.run([pip, *args], check=True)
    print("✓ Python requirements installed")
|
||||
|
||||
|
||||
def create_directories():
    """Create the working directories the app expects (idempotent)."""
    print("\n📁 Creating directories...")
    for name in ("downloads", "resumenes_docx", "logs", "processed"):
        Path(name).mkdir(exist_ok=True)
        print(f" ✓ {name}")
    print()
|
||||
|
||||
|
||||
def create_env_file():
    """Create .env from the .env.example template if it doesn't exist.

    No-op (with a message) when .env is already present or when the
    template is missing.
    """
    env_path = Path(".env")
    example_path = Path(".env.example")

    if env_path.exists():
        print("✓ .env file already exists")
        return

    if not example_path.exists():
        print("⚠ .env.example not found")
        return

    print("\n📝 Creating .env file from template...")
    print(" Please edit .env file and add your API keys")

    # FIX: copy with an explicit encoding.  The template may contain
    # non-ASCII text, and the platform default (e.g. cp1252 on Windows)
    # could corrupt it or raise UnicodeDecodeError.
    content = example_path.read_text(encoding="utf-8")
    env_path.write_text(content, encoding="utf-8")

    print("✓ .env file created from .env.example")
    print(" ⚠ Please edit .env and add your API keys!")
|
||||
|
||||
|
||||
def main():
    """Run the full interactive setup: checks, venv, deps, dirs, .env."""
    rule = "=" * 60

    # Header
    print(rule)
    print("CBCFacil Setup Script")
    print(rule)
    print()

    # Setup steps, in dependency order; each prints its own progress.
    check_python_version()
    check_system_dependencies()
    install_requirements(create_virtual_environment())
    create_directories()
    create_env_file()

    # Footer with follow-up instructions for the user.
    print("\n" + rule)
    print("✓ Setup complete!")
    print(rule)
    print("\nNext steps:")
    print(" 1. Edit .env file and add your API keys")
    print(" 2. Run: source venv/bin/activate (Linux/macOS)")
    print(" or venv\\Scripts\\activate (Windows)")
    print(" 3. Run: python main_refactored.py")
    print("\nFor dashboard only:")
    print(" python -c \"from api.routes import create_app; app = create_app(); app.run(port=5000)\"")
    print()
|
||||
|
||||
|
||||
# Entry point: run the interactive setup when executed as a script.
if __name__ == "__main__":
    main()
|
||||
7
storage/__init__.py
Normal file
7
storage/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""
|
||||
Storage package for CBCFacil
|
||||
"""
|
||||
|
||||
from .processed_registry import ProcessedRegistry
|
||||
|
||||
__all__ = ['ProcessedRegistry']
|
||||
235
storage/processed_registry.py
Normal file
235
storage/processed_registry.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""
|
||||
Processed files registry - Optimized version with bloom filter and better caching
|
||||
"""
|
||||
import fcntl
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Set, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from ..config import settings
|
||||
|
||||
|
||||
class BloomFilter:
    """Simple Bloom filter for fast, memory-cheap membership testing.

    False positives are possible; false negatives are not.  Probe
    positions are derived deterministically from an MD5 digest of the
    item, so results are stable across runs.
    """

    def __init__(self, size: int = 10000, hash_count: int = 3):
        # size: number of bits; hash_count: probes per item (effectively
        # capped at 4 by the 16-byte MD5 digest).
        self.size = size
        self.hash_count = hash_count
        self.bit_array = [0] * size

    def _hashes(self, item: str) -> list[int]:
        """Return the bit positions probed for *item*."""
        import hashlib

        digest = hashlib.md5(item.encode()).digest()
        limit = min(self.hash_count * 4, len(digest))
        positions = []
        for offset in range(0, limit, 4):
            word = int.from_bytes(digest[offset:offset + 4], 'big')
            positions.append(word % self.size)
        return positions

    def add(self, item: str) -> None:
        """Mark *item* as present by setting all of its probe bits."""
        for position in self._hashes(item):
            self.bit_array[position] = 1

    def might_contain(self, item: str) -> bool:
        """Return False when *item* was definitely never added."""
        return all(self.bit_array[position] for position in self._hashes(item))
|
||||
|
||||
|
||||
class ProcessedRegistry:
    """Registry for tracking processed files with caching and file locking.

    A TTL-bound in-memory set mirrors the on-disk registry file, and a
    Bloom filter gives an O(1) fast path for negative membership checks.
    Disk writes are serialized with ``fcntl`` advisory locks.  The cache
    indexes each entry under both its full path and its basename.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self._cache: Set[str] = set()          # full paths AND basenames
        self._cache_time: Optional[float] = None
        self._cache_ttl = 300  # 5 minutes (was 60s previously)
        self._initialized = False
        self._bloom_filter = BloomFilter(size=10000, hash_count=3)
        self._write_lock = False  # NOTE(review): unused; kept for compatibility

    def initialize(self) -> None:
        """Load the registry from disk and mark it ready for queries."""
        self.load()
        self._initialized = True
        self.logger.info(f"Processed registry initialized ({self.count()} files)")

    def load(self) -> Set[str]:
        """Load processed files from disk, honouring the cache TTL.

        Returns the live cache set (not a copy) for cheap read-only use.
        Lines starting with '#' are treated as comments and skipped.
        """
        now = time.time()

        # Serve from cache while it is still fresh.
        if self._cache and self._cache_time:
            if now - self._cache_time < self._cache_ttl:
                return self._cache

        processed = set()
        registry_path = settings.processed_files_path

        try:
            registry_path.parent.mkdir(parents=True, exist_ok=True)
            if registry_path.exists():
                with open(registry_path, 'r', encoding='utf-8') as f:
                    for raw_line in f:
                        line = raw_line.strip()
                        if line and not line.startswith('#'):
                            # Index both the full path and its basename so
                            # lookups succeed with either form.
                            processed.add(line)
                            base_name = Path(line).name
                            processed.add(base_name)
                            self._bloom_filter.add(line)
                            self._bloom_filter.add(base_name)
        except Exception as e:
            self.logger.error(f"Error reading processed files registry: {e}")

        self._cache = processed
        self._cache_time = now
        return processed

    def save(self, file_path: str) -> None:
        """Append *file_path* to the registry (no-op when already cached).

        Raises on I/O failure after logging, so callers can react.
        """
        if not file_path:
            return

        registry_path = settings.processed_files_path

        try:
            registry_path.parent.mkdir(parents=True, exist_ok=True)

            # Check cache first to avoid a pointless lock/write.
            if file_path in self._cache:
                return

            with open(registry_path, 'a', encoding='utf-8') as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
                try:
                    f.write(file_path + "\n")
                    # Update in-memory structures under the same lock.
                    self._cache.add(file_path)
                    self._bloom_filter.add(file_path)
                    self._cache_time = time.time()
                    self.logger.debug(f"Added {file_path} to processed registry")
                finally:
                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
        except Exception as e:
            self.logger.error(f"Error saving to processed files registry: {e}")
            raise

    def is_processed(self, file_path: str) -> bool:
        """Check if file has been processed - O(1) with bloom filter."""
        if not self._initialized:
            self.initialize()

        # Bloom filter first: a negative answer is definitive.
        if not self._bloom_filter.might_contain(file_path):
            return False

        # Confirm against the exact cache (full path, then basename).
        if file_path in self._cache:
            return True
        return Path(file_path).name in self._cache

    def save_batch(self, file_paths: list[str]) -> int:
        """Append many paths under one lock; return how many were new."""
        saved_count = 0

        try:
            registry_path = settings.processed_files_path
            registry_path.parent.mkdir(parents=True, exist_ok=True)

            with open(registry_path, 'a', encoding='utf-8') as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
                try:
                    lines_to_write = []
                    for file_path in file_paths:
                        if file_path and file_path not in self._cache:
                            lines_to_write.append(file_path + "\n")
                            self._cache.add(file_path)
                            self._bloom_filter.add(file_path)
                            saved_count += 1

                    if lines_to_write:
                        f.writelines(lines_to_write)
                        self._cache_time = time.time()
                        self.logger.debug(f"Added {saved_count} files to processed registry")
                finally:
                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
        except Exception as e:
            self.logger.error(f"Error saving batch to processed files registry: {e}")

        return saved_count

    def remove(self, file_path: str) -> bool:
        """Remove *file_path* (matched by full path or basename) from the registry.

        FIX: the previous implementation read the file without a lock and
        truncated it (mode 'w') *before* acquiring the lock, so a concurrent
        writer could lose its appended lines.  The read-filter-rewrite now
        happens under a single exclusive lock using mode 'r+'.
        """
        registry_path = settings.processed_files_path

        try:
            if not registry_path.exists():
                return False

            target_name = Path(file_path).name
            with open(registry_path, 'r+', encoding='utf-8') as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
                try:
                    lines_to_keep = [
                        line for line in f
                        if line.strip() != file_path
                        and Path(line.strip()).name != target_name
                    ]
                    f.seek(0)
                    f.truncate()
                    f.writelines(lines_to_keep)
                    self._cache.discard(file_path)
                    self._cache.discard(target_name)
                    # Bloom filters cannot delete entries; rebuild from cache.
                    self._bloom_filter = BloomFilter(size=10000, hash_count=3)
                    for item in self._cache:
                        self._bloom_filter.add(item)
                finally:
                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
            return True
        except Exception as e:
            self.logger.error(f"Error removing from processed files registry: {e}")
            return False

    def clear(self) -> None:
        """Delete the registry file and reset all in-memory state."""
        registry_path = settings.processed_files_path
        try:
            if registry_path.exists():
                registry_path.unlink()
            self._cache.clear()
            self._cache_time = None
            self._bloom_filter = BloomFilter(size=10000, hash_count=3)
            self.logger.info("Processed files registry cleared")
        except Exception as e:
            self.logger.error(f"Error clearing processed files registry: {e}")
            raise

    def get_all(self) -> Set[str]:
        """Return a copy of all known entries (paths and basenames)."""
        if not self._initialized:
            self.initialize()
        return self._cache.copy()

    def count(self) -> int:
        """Return the number of cached entries (paths plus basenames)."""
        if not self._initialized:
            self.initialize()
        return len(self._cache)

    def get_stats(self) -> dict:
        """Return registry statistics for diagnostics/monitoring."""
        return {
            "total_files": len(self._cache),
            "cache_age_seconds": time.time() - self._cache_time if self._cache_time else 0,
            "cache_ttl_seconds": self._cache_ttl,
            "bloom_filter_size": self._bloom_filter.size,
            "initialized": self._initialized
        }
|
||||
|
||||
|
||||
# Global instance: process-wide registry singleton shared by importers.
processed_registry = ProcessedRegistry()
|
||||
BIN
test_audio/imperio.mp3
Normal file
BIN
test_audio/imperio.mp3
Normal file
Binary file not shown.
3
tests/__init__.py
Normal file
3
tests/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
Test package for CBCFacil
|
||||
"""
|
||||
20
tests/conftest.py
Normal file
20
tests/conftest.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""
|
||||
Pytest configuration for CBCFacil tests
|
||||
Ensures proper Python path for module imports
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Add project root to path
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
# Set environment variables for testing
|
||||
os.environ.setdefault('LOCAL_STATE_DIR', str(PROJECT_ROOT / 'state'))
|
||||
os.environ.setdefault('LOCAL_DOWNLOADS_PATH', str(PROJECT_ROOT / 'downloads'))
|
||||
|
||||
# Disable external service connections during tests
|
||||
os.environ.setdefault('NEXTCLOUD_URL', '')
|
||||
os.environ.setdefault('ANTHROPIC_AUTH_TOKEN', '')
|
||||
os.environ.setdefault('GEMINI_API_KEY', '')
|
||||
112
verify_improvements.py
Executable file
112
verify_improvements.py
Executable file
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script de verificación para mejoras de la Fase 3
|
||||
Verifica que todas las mejoras están correctamente implementadas
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
def check_file_exists(filepath, description):
    """Check that a file exists, printing a pass/fail line; returns bool."""
    present = Path(filepath).exists()
    if present:
        print(f"✅ {description}: {filepath}")
    else:
        print(f"❌ {description}: {filepath} NO ENCONTRADO")
    return present
|
||||
|
||||
def check_function_in_file(filepath, function_name, description):
    """Check that *function_name* occurs in the file's text; returns bool.

    A plain substring search is used, so any occurrence (even inside a
    comment) counts.  Read failures print an error and return False.

    FIX: read with explicit UTF-8 — the scanned sources contain emoji,
    and the platform default encoding (e.g. cp1252) could raise
    UnicodeDecodeError.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        if function_name in content:
            print(f"✅ {description}: Encontrado '{function_name}'")
            return True
        else:
            print(f"❌ {description}: NO encontrado '{function_name}'")
            return False
    except Exception as e:
        print(f"❌ Error leyendo {filepath}: {e}")
        return False
||||
|
||||
def check_class_in_file(filepath, class_name, description):
    """Check that a class definition occurs in the file.

    Delegates to check_function_in_file with a 'class <name>' needle.
    """
    needle = f"class {class_name}"
    return check_function_in_file(filepath, needle, description)
|
||||
|
||||
def main():
    """Run all Phase 3 verification checks; return 0 on full pass, 1 otherwise."""
    print("=" * 70)
    print("🔍 VERIFICACIÓN DE MEJORAS FASE 3 - CBCFacil")
    print("=" * 70)
    print()

    # Each check appends a bool; the summary is computed from this list.
    results = []

    # 1. Verify that the modified/created files exist
    print("📁 ARCHIVOS:")
    print("-" * 70)
    results.append(check_file_exists("main.py", "main.py modificado"))
    results.append(check_file_exists("config/settings.py", "config/settings.py modificado"))
    results.append(check_file_exists("core/health_check.py", "core/health_check.py creado"))
    results.append(check_file_exists("IMPROVEMENTS_LOG.md", "IMPROVEMENTS_LOG.md creado"))
    print()

    # 2. Verify improvements in main.py (substring presence checks)
    print("🔧 MEJORAS EN main.py:")
    print("-" * 70)
    results.append(check_function_in_file("main.py", "logger.exception", "logger.exception() implementado"))
    results.append(check_function_in_file("main.py", "class JSONFormatter", "JSONFormatter implementado"))
    results.append(check_function_in_file("main.py", "def validate_configuration", "validate_configuration() implementado"))
    results.append(check_function_in_file("main.py", "def check_service_health", "check_service_health() implementado"))
    results.append(check_function_in_file("main.py", "def send_error_notification", "send_error_notification() implementado"))
    results.append(check_function_in_file("main.py", "def setup_logging", "setup_logging() implementado"))
    print()

    # 3. Verify improvements in config/settings.py
    print("⚙️ MEJORAS EN config/settings.py:")
    print("-" * 70)
    results.append(check_function_in_file("config/settings.py", "class ConfigurationError", "ConfigurationError definido"))
    results.append(check_function_in_file("config/settings.py", "def nextcloud_url", "Propiedad nextcloud_url con validación"))
    results.append(check_function_in_file("config/settings.py", "def valid_webdav_config", "Propiedad valid_webdav_config"))
    results.append(check_function_in_file("config/settings.py", "def telegram_configured", "Propiedad telegram_configured"))
    results.append(check_function_in_file("config/settings.py", "def has_gpu_support", "Propiedad has_gpu_support"))
    results.append(check_function_in_file("config/settings.py", "def config_summary", "Propiedad config_summary"))
    print()

    # 4. Verify core/health_check.py contents
    print("❤️ HEALTH CHECKS:")
    print("-" * 70)
    results.append(check_function_in_file("core/health_check.py", "class HealthChecker", "Clase HealthChecker"))
    results.append(check_function_in_file("core/health_check.py", "def check_webdav_connection", "check_webdav_connection()"))
    results.append(check_function_in_file("core/health_check.py", "def check_ai_providers", "check_ai_providers()"))
    results.append(check_function_in_file("core/health_check.py", "def check_vram_manager", "check_vram_manager()"))
    results.append(check_function_in_file("core/health_check.py", "def check_disk_space", "check_disk_space()"))
    results.append(check_function_in_file("core/health_check.py", "def run_full_health_check", "run_full_health_check()"))
    print()

    # Summary
    print("=" * 70)
    print("📊 RESUMEN:")
    print("=" * 70)
    passed = sum(results)
    total = len(results)
    percentage = (passed / total) * 100

    print(f"Verificaciones pasadas: {passed}/{total} ({percentage:.1f}%)")
    print()

    # Exit status follows shell convention: 0 = all good, 1 = failures.
    if percentage == 100:
        print("🎉 ¡TODAS LAS MEJORAS ESTÁN CORRECTAMENTE IMPLEMENTADAS!")
        print()
        print("Puedes probar:")
        print(" python main.py health")
        print()
        return 0
    else:
        print("⚠️ Algunas verificaciones fallaron")
        print()
        return 1
|
||||
|
||||
# Script entry point: exit status 0 when all checks pass, 1 otherwise.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user