Compare commits

...

2 Commits

Author SHA1 Message Date
renato97
207b7856b5 Actualización: cambios varios y archivos nuevos
- Modificaciones en main.py
- Agregado kubectl
- Agregado plus.md y todo.md
2026-01-10 19:25:37 +00:00
renato97
75ef0afcb1 feat(dashboard): agregar panel de versiones y corregir carga de transcripciones
- Corregir endpoints /api/transcription y /api/summary para manejar filenames con extensión
- Agregar endpoint /api/versions para listar archivos generados
- Agregar tab 'Versiones' en panel lateral con lista de archivos
- Mejorar modal de progreso con barra animada y estados
- Cambiar archivos para que se abran en pestaña en lugar de descargarse
- Agregar botón 'Regenerar' en lista de archivos procesados
2026-01-10 19:18:14 +00:00
6 changed files with 3652 additions and 47 deletions

View File

@@ -2,6 +2,7 @@
Flask API routes for CBCFacil dashboard Flask API routes for CBCFacil dashboard
""" """
import os import os
import time
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import Dict, Any, List from typing import Dict, Any, List
@@ -9,14 +10,20 @@ from flask import Flask, render_template, request, jsonify, send_from_directory
from flask_cors import CORS from flask_cors import CORS
from config import settings from config import settings
from storage import processed_registry from storage.processed_registry import processed_registry
from services.webdav_service import webdav_service from services.webdav_service import webdav_service
from services import vram_manager from services import vram_manager
from document.generators import DocumentGenerator
def create_app() -> Flask: def create_app() -> Flask:
"""Create and configure Flask application""" """Create and configure Flask application"""
app = Flask(__name__) # Get the project root directory (parent of api/)
current_dir = Path(__file__).parent
project_root = current_dir.parent
template_dir = project_root / 'templates'
app = Flask(__name__, template_folder=str(template_dir))
CORS(app) CORS(app)
# Configure app # Configure app
@@ -157,6 +164,60 @@ def create_app() -> Flask:
app.logger.error(f"Error downloading file: {e}") app.logger.error(f"Error downloading file: {e}")
return jsonify({'error': 'File not found'}), 404 return jsonify({'error': 'File not found'}), 404
@app.route('/downloads/find-file')
def find_and_download_file():
    """Locate a generated file by probing several name variants and serve it.

    Query params:
        filename: base file name (extension-less); '..' and absolute paths
            are rejected to prevent path traversal.
        ext: file extension to look for (default 'txt').

    Serves the first matching variant inline (as_attachment=False) so the
    browser opens it instead of downloading; returns JSON 404 otherwise.
    """
    try:
        filename = request.args.get('filename', '')
        ext = request.args.get('ext', 'txt')

        if not filename:
            return jsonify({'error': 'Filename required'}), 400

        # Validate to prevent path traversal
        if '..' in filename or filename.startswith('/'):
            return jsonify({'error': 'Invalid filename'}), 400

        # Try various name variants: with/without unified suffixes, and
        # with spaces replaced by underscores.
        base_name = filename.replace('_unificado', '').replace('_unified', '')
        name_variants = [
            f"{base_name}.{ext}",
            f"{base_name}_unificado.{ext}",
            f"{base_name}_unified.{ext}",
            f"{base_name.replace(' ', '_')}.{ext}",
            f"{base_name.replace(' ', '_')}_unificado.{ext}",
        ]

        # Directories to search, in priority order.
        directories = [
            settings.LOCAL_DOWNLOADS_PATH,
            settings.LOCAL_DOCX,
        ]

        # Extension -> mimetype so the browser renders the file inline
        # (table replaces the previous if/elif chain; unknown ext -> None,
        # letting Flask guess, same as before).
        inline_mimetypes = {
            'pdf': 'application/pdf',
            'md': 'text/markdown',
            'txt': 'text/plain',
        }

        for directory in directories:
            if not directory.exists():
                continue
            for variant in name_variants:
                if (directory / variant).exists():
                    # as_attachment=False -> open in browser, not download
                    return send_from_directory(
                        str(directory),
                        variant,
                        as_attachment=False,
                        mimetype=inline_mimetypes.get(ext),
                    )

        # BUGFIX: the 404 message previously contained a literal placeholder
        # instead of the requested name; interpolate it so clients can see
        # exactly what was missing.
        return jsonify({'error': f'File not found: {filename}.{ext}'}), 404
    except Exception as e:
        app.logger.error(f"Error finding file: {e}")
        return jsonify({'error': 'File not found'}), 404
@app.route('/health') @app.route('/health')
def health_check(): def health_check():
"""Health check endpoint""" """Health check endpoint"""
@@ -174,11 +235,316 @@ def create_app() -> Flask:
} }
}) })
@app.route('/api/transcription/<filename>')
def get_transcription(filename: str):
    """Return the stored transcription text plus word/char statistics."""
    try:
        # Reject anything that could escape the downloads directory.
        if '..' in filename or filename.startswith('/'):
            return jsonify({
                'success': False,
                'message': 'Invalid filename'
            }), 400

        # Strip whatever extension came in (.mp3, .wav, .txt, etc.).
        stem = Path(filename).stem

        # The transcription is stored alongside the audio as <stem>.txt.
        txt_file = settings.LOCAL_DOWNLOADS_PATH / f"{stem}.txt"
        if not txt_file.exists():
            return jsonify({
                'success': False,
                'message': f'Transcription file not found: {stem}.txt'
            }), 404

        with open(txt_file, 'r', encoding='utf-8') as fh:
            content = fh.read()

        return jsonify({
            'success': True,
            'filename': filename,
            'transcription': content,
            'file_path': str(txt_file),
            'word_count': len(content.split()),
            'char_count': len(content)
        })
    except Exception as e:
        app.logger.error(f"Error reading transcription: {e}")
        return jsonify({
            'success': False,
            'message': f"Error reading transcription: {str(e)}"
        }), 500
@app.route('/api/summary/<filename>')
def get_summary(filename: str):
    """Return the markdown summary for *filename*.

    Resolves the summary by stripping the extension and any
    _unificado/_unified suffix, then probing the known naming variants
    under LOCAL_DOWNLOADS_PATH. Also reports which output formats exist.
    """
    try:
        # Validate filename to prevent path traversal
        if '..' in filename or filename.startswith('/'):
            return jsonify({
                'success': False,
                'message': 'Invalid filename'
            }), 400

        # Extract base name without extension (handle .mp3, .wav, etc.)
        base_name = Path(filename).stem
        # Also remove _unificado/_unified suffixes if present
        base_name = base_name.replace('_unificado', '').replace('_unified', '')

        # Try the known file name variants, most specific first.
        possible_paths = [
            settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_unificado.md",
            settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_unified.md",
            settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.md",
        ]
        file_path = next((p for p in possible_paths if p.exists()), None)

        if not file_path:
            # BUGFIX: the 404 message previously contained a literal
            # placeholder instead of the requested name.
            return jsonify({
                'success': False,
                'message': f'Summary file not found for: {filename}'
            }), 404

        with open(file_path, 'r', encoding='utf-8') as f:
            summary_text = f.read()

        # Which output formats (md/docx/pdf/...) exist for this base name.
        formats_available = get_available_formats(base_name)

        return jsonify({
            'success': True,
            'filename': base_name,
            'summary': summary_text,
            'file_path': str(file_path),
            'formats_available': formats_available
        })
    except Exception as e:
        app.logger.error(f"Error reading summary: {e}")
        return jsonify({
            'success': False,
            'message': f"Error reading summary: {str(e)}"
        }), 500
@app.route('/api/versions/<filename>')
def get_versions(filename: str):
    """List every generated artifact (transcription + summary formats) for a file."""
    try:
        # Path-traversal guard.
        if '..' in filename or filename.startswith('/'):
            return jsonify({'success': False, 'message': 'Invalid filename'}), 400

        # Normalize to the bare base name.
        base_name = Path(filename).stem
        base_name = base_name.replace('_unificado', '').replace('_unified', '')

        downloads_path = settings.LOCAL_DOWNLOADS_PATH
        docx_path = settings.LOCAL_DOCX
        versions = []

        # Original transcription (.txt), if present.
        txt_path = downloads_path / f"{base_name}.txt"
        if txt_path.exists():
            meta = txt_path.stat()
            versions.append({
                'type': 'transcription',
                'label': '📝 Transcripción Original',
                'filename': txt_path.name,
                'path': f"/downloads/find-file?filename={base_name}&ext=txt",
                'date': datetime.fromtimestamp(meta.st_mtime).strftime('%Y-%m-%d %H:%M'),
                'size': f"{meta.st_size / 1024:.1f} KB"
            })

        # Summary artifacts (md / docx / pdf): downloads dir takes priority,
        # falling back to the DOCX directory.
        for pattern, label in (
            (f"{base_name}_unificado.md", "📋 Resumen MD"),
            (f"{base_name}_unificado.docx", "📄 Documento DOCX"),
            (f"{base_name}_unificado.pdf", "📑 PDF"),
        ):
            candidate = downloads_path / pattern
            if not candidate.exists():
                candidate = docx_path / pattern
            if candidate.exists():
                meta = candidate.stat()
                versions.append({
                    'type': 'summary',
                    'label': label,
                    'filename': pattern,
                    'path': f"/downloads/find-file?filename={base_name}&ext={candidate.suffix[1:]}",
                    'date': datetime.fromtimestamp(meta.st_mtime).strftime('%Y-%m-%d %H:%M'),
                    'size': f"{meta.st_size / 1024:.1f} KB"
                })

        # Newest first (ISO-like date strings sort lexicographically).
        versions.sort(key=lambda v: v['date'], reverse=True)

        return jsonify({
            'success': True,
            'base_name': base_name,
            'versions': versions,
            'count': len(versions)
        })
    except Exception as e:
        app.logger.error(f"Error getting versions: {e}")
        return jsonify({'success': False, 'message': str(e)}), 500
@app.route('/api/regenerate-summary', methods=['POST'])
def regenerate_summary():
    """Regenerate the summary from an existing transcription.

    JSON body:
        filename: audio/transcription name (extension optional).
        custom_prompt: accepted but currently NOT forwarded to the
            generator — NOTE(review): confirm whether generate_summary
            is meant to receive it.

    Re-runs DocumentGenerator on the stored .txt transcription and, when
    WebDAV is configured, best-effort uploads the regenerated md/docx/pdf.
    """
    start_time = time.time()
    try:
        # ROBUSTNESS: get_json() returns None for a missing or non-JSON
        # body, which previously crashed on .get(); normalize to a dict.
        data = request.get_json(silent=True) or {}
        filename = data.get('filename')
        custom_prompt = data.get('custom_prompt')  # unused — see docstring

        if not filename:
            return jsonify({
                'success': False,
                'message': 'Filename is required'
            }), 400

        # Validate filename to prevent path traversal
        if '..' in filename or filename.startswith('/'):
            return jsonify({
                'success': False,
                'message': 'Invalid filename'
            }), 400

        # Get base name (remove extension if present)
        base_name = Path(filename).stem

        # Read transcription from .txt file; fall back to the raw name in
        # case the caller already included the .txt extension.
        transcription_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.txt"
        if not transcription_path.exists():
            transcription_path = settings.LOCAL_DOWNLOADS_PATH / filename
            if not transcription_path.exists():
                # BUGFIX: the 404 message previously contained a literal
                # placeholder instead of the requested name.
                return jsonify({
                    'success': False,
                    'message': f'Transcription file not found for: {filename}'
                }), 404

        with open(transcription_path, 'r', encoding='utf-8') as f:
            transcription_text = f.read()

        # Generate new summary using DocumentGenerator
        doc_generator = DocumentGenerator()
        success, new_summary, metadata = doc_generator.generate_summary(
            transcription_text,
            base_name
        )

        if not success:
            return jsonify({
                'success': False,
                'message': 'Failed to generate summary'
            }), 500

        # Best-effort upload of regenerated artifacts to WebDAV; regeneration
        # already succeeded, so upload failures only log a warning.
        files_updated = []
        if settings.has_webdav_config:
            try:
                # (metadata key, remote folder) per artifact type.
                upload_plan = [
                    ('markdown_path', settings.REMOTE_TXT_FOLDER),
                    ('docx_path', settings.DOCX_FOLDER),
                    ('pdf_path', settings.REMOTE_PDF_FOLDER),
                ]
                for meta_key, remote_folder in upload_plan:
                    if meta_key in metadata:
                        local_path = Path(metadata[meta_key])
                        if local_path.exists():
                            remote_path = f"{remote_folder}/{local_path.name}"
                            webdav_service.upload(str(local_path), remote_path)
                            files_updated.append(remote_path)
            except Exception as e:
                app.logger.warning(f"WebDAV upload failed: {e}")
                # Continue even if upload fails

        processing_time = time.time() - start_time
        return jsonify({
            'success': True,
            'message': 'Summary regenerated successfully',
            'new_summary': new_summary,
            'files_updated': files_updated,
            'processing_time': f"{processing_time:.2f}s",
            'metadata': metadata
        })
    except Exception as e:
        app.logger.error(f"Error regenerating summary: {e}")
        return jsonify({
            'success': False,
            'message': f"Error regenerating summary: {str(e)}"
        }), 500
@app.route('/api/files-detailed')
def get_files_detailed():
    """Return the detailed file listing plus aggregate transcription/summary counts."""
    try:
        detailed = get_audio_files_detailed()
        return jsonify({
            'success': True,
            'files': detailed,
            'total': len(detailed),
            'with_transcription': sum(1 for item in detailed if item['has_transcription']),
            'with_summary': sum(1 for item in detailed if item['has_summary'])
        })
    except Exception as e:
        app.logger.error(f"Error getting detailed files: {e}")
        return jsonify({
            'success': False,
            'message': f"Error: {str(e)}"
        }), 500
return app return app
def get_audio_files() -> List[Dict[str, Any]]: def get_audio_files() -> List[Dict[str, Any]]:
"""Get list of audio files from WebDAV and local""" """Get list of audio files from WebDAV and local"""
import logging
logger = logging.getLogger(__name__)
files = [] files = []
# Get files from WebDAV # Get files from WebDAV
@@ -202,7 +568,7 @@ def get_audio_files() -> List[Dict[str, Any]]:
'available_formats': get_available_formats(base_name) 'available_formats': get_available_formats(base_name)
}) })
except Exception as e: except Exception as e:
app.logger.error(f"Error getting WebDAV files: {e}") logger.warning(f"Error getting WebDAV files: {e}")
# Get local files # Get local files
try: try:
@@ -222,13 +588,113 @@ def get_audio_files() -> List[Dict[str, Any]]:
'available_formats': get_available_formats(file_path.name) 'available_formats': get_available_formats(file_path.name)
}) })
except Exception as e: except Exception as e:
app.logger.error(f"Error getting local files: {e}") logger.error(f"Error getting local files: {e}")
# Remove duplicates (WebDAV takes precedence) # Remove duplicates (keep both local and webdav - distinguish by source)
unique_files = {} unique_files = {}
for file in files: for file in files:
key = file['filename'] # Use (filename, source) as key to keep both local and webdav files
if key not in unique_files or file['source'] == 'webdav': key = (file['filename'], file['source'])
unique_files[key] = file
return sorted(unique_files.values(), key=lambda x: (x['source'], x['filename']))
def get_audio_files_detailed() -> List[Dict[str, Any]]:
    """Get detailed list of audio files with transcription and summary information.

    Scans LOCAL_DOWNLOADS_PATH for audio files (full detail: word counts,
    summary paths, mtimes), then appends WebDAV audio files not already
    present locally (limited detail). Results are de-duplicated by base
    name (first occurrence wins) and sorted by filename.
    """
    files = []

    # Local audio files: full detail is available from disk.
    try:
        if settings.LOCAL_DOWNLOADS_PATH.exists():
            for ext in settings.AUDIO_EXTENSIONS:
                for file_path in settings.LOCAL_DOWNLOADS_PATH.glob(f"*{ext}"):
                    stat = file_path.stat()
                    filename = file_path.name
                    base_name = file_path.stem

                    # Transcription (<base>.txt) and its word count.
                    transcription_path = settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.txt"
                    has_transcription = transcription_path.exists()
                    transcription_words = 0
                    if has_transcription:
                        try:
                            with open(transcription_path, 'r', encoding='utf-8') as f:
                                transcription_words = len(f.read().split())
                        except Exception:
                            # Unreadable transcription: keep the entry, count stays 0.
                            pass

                    # Check for summary and formats
                    formats = get_available_formats(filename)
                    has_summary = formats.get('md', False)

                    # Locate the summary markdown among the known variants.
                    summary_path = None
                    if has_summary:
                        for variant in (
                            settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_unificado.md",
                            settings.LOCAL_DOWNLOADS_PATH / f"{base_name}_unified.md",
                            settings.LOCAL_DOWNLOADS_PATH / f"{base_name}.md",
                        ):
                            if variant.exists():
                                summary_path = str(variant)
                                break

                    files.append({
                        'filename': filename,
                        'base_name': base_name,
                        'audio_path': str(file_path),
                        'has_transcription': has_transcription,
                        'transcription_path': str(transcription_path) if has_transcription else None,
                        'transcription_words': transcription_words,
                        'has_summary': has_summary,
                        'summary_path': summary_path,
                        'formats': formats,
                        'processed': processed_registry.is_processed(filename),
                        'last_modified': datetime.fromtimestamp(stat.st_mtime).strftime('%Y-%m-%d %H:%M:%S'),
                        'size': format_size(stat.st_size)
                    })
    except Exception:
        # Deliberate best-effort: the calling endpoint logs failures.
        pass

    # WebDAV audio files not already seen locally (limited detail).
    if settings.has_webdav_config:
        try:
            # PERF: snapshot local base names once instead of scanning the
            # growing list for every remote file (was O(n^2)); the final
            # de-dup below still drops any remote duplicates.
            local_bases = {f['base_name'] for f in files}
            for file_path in webdav_service.list(settings.REMOTE_AUDIOS_FOLDER):
                normalized_path = webdav_service.normalize_path(file_path)
                base_name = Path(normalized_path).stem
                if not any(normalized_path.lower().endswith(ext) for ext in settings.AUDIO_EXTENSIONS):
                    continue
                if base_name in local_bases:
                    continue
                formats = get_available_formats(base_name)
                files.append({
                    'filename': Path(normalized_path).name,
                    'base_name': base_name,
                    'audio_path': normalized_path,
                    'has_transcription': formats.get('txt', False),
                    'transcription_path': None,
                    'transcription_words': 0,
                    'has_summary': formats.get('md', False),
                    'summary_path': None,
                    'formats': formats,
                    'processed': processed_registry.is_processed(normalized_path),
                    'last_modified': 'Unknown',
                    'size': 'Unknown'
                })
        except Exception:
            # Deliberate best-effort: the calling endpoint logs failures.
            pass

    # De-duplicate by base name (first occurrence wins) and sort.
    unique_files = {}
    for file in files:
        unique_files.setdefault(file['base_name'], file)
    return sorted(unique_files.values(), key=lambda x: x['filename'])

BIN
kubectl Normal file

Binary file not shown.

48
main.py
View File

@@ -9,10 +9,15 @@ import time
import fcntl import fcntl
import os import os
import json import json
import threading
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
from typing import Optional from typing import Optional
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
# Configure logging with JSON formatter for production # Configure logging with JSON formatter for production
class JSONFormatter(logging.Formatter): class JSONFormatter(logging.Formatter):
"""JSON formatter for structured logging in production""" """JSON formatter for structured logging in production"""
@@ -234,6 +239,41 @@ def send_error_notification(error_type: str, error_message: str) -> None:
logger.warning(f"Failed to send error notification: {e}") logger.warning(f"Failed to send error notification: {e}")
def run_dashboard_thread() -> None:
    """Run Flask dashboard in a separate thread.

    Binds on all interfaces with the Flask dev server in threaded mode.
    Any startup failure is logged; the exception is not re-raised because
    this runs in a daemon thread with no caller to handle it.
    """
    try:
        from api.routes import create_app
        app = create_app()
        # BUGFIX: honor DASHBOARD_PORT. start_dashboard() reads and logs
        # this variable, but the server previously always bound to a
        # hard-coded 5000 regardless of the configured port.
        port = int(os.getenv('DASHBOARD_PORT', '5000'))
        app.run(
            host='0.0.0.0',
            port=port,
            debug=False,
            threaded=True,
            use_reloader=False  # Important: disable reloader in thread
        )
    except Exception as e:
        logger.error(f"Dashboard thread error: {e}")
        logger.exception("Dashboard thread exception details")
def start_dashboard() -> threading.Thread:
    """Spawn the Flask dashboard in a background daemon thread and return it."""
    dashboard_port = int(os.getenv('DASHBOARD_PORT', '5000'))
    logger.info(f"Starting dashboard on port {dashboard_port}...")

    # Daemon thread: it must never keep the process alive at shutdown.
    worker = threading.Thread(
        target=run_dashboard_thread,
        name="DashboardThread",
        daemon=True,
    )
    worker.start()
    logger.info(f"Dashboard thread started (Thread-ID: {worker.ident})")
    return worker
def run_main_loop() -> None: def run_main_loop() -> None:
"""Main processing loop with improved error handling""" """Main processing loop with improved error handling"""
from config import settings from config import settings
@@ -418,13 +458,19 @@ def run_main_loop() -> None:
def main(): def main():
"""Main entry point""" """Main entry point"""
lock_fd = None lock_fd = None
dashboard_thread = None
try: try:
logger.info("=== CBCFacil Service Started ===") logger.info("=== CBCFacil Service Started ===")
logger.info(f"Version: {os.getenv('APP_VERSION', '8.0')}") logger.info(f"Version: {os.getenv('APP_VERSION', '8.0')}")
logger.info(f"Environment: {'production' if os.getenv('DEBUG', 'false').lower() != 'true' else 'development'}") logger.info(f"Environment: {'production' if os.getenv('DEBUG', 'false').lower() != 'true' else 'development'}")
lock_fd = acquire_lock() lock_fd = acquire_lock()
initialize_services() initialize_services()
# Start dashboard in background thread
dashboard_thread = start_dashboard()
# Run main processing loop
run_main_loop() run_main_loop()
except KeyboardInterrupt: except KeyboardInterrupt:

799
plus.md Normal file
View File

@@ -0,0 +1,799 @@
# 🚀 CBCFacil - Mejoras y Extensiones Recomendadas
Documento con recomendaciones para hacer el proyecto más complejo, robusto y profesional.
---
## 📋 Resumen Ejecutivo
Después de analizar todo el proyecto, identifiqué las siguientes áreas principales de mejora:
| Área | Prioridad | Complejidad | Estado Actual |
|------|-----------|-------------|---------------|
| Testing | 🔴 Alta | Media | Solo `conftest.py` existe |
| Frontend Dashboard | 🔴 Alta | Alta | Template básico sin JS |
| Sistema de Colas | 🟡 Media | Alta | Loop síncrono simple |
| Autenticación API | 🔴 Alta | Media | Sin autenticación |
| Base de Datos | 🟡 Media | Media | Solo archivo TXT |
| Métricas/Observabilidad | 🟡 Media | Media | Básico |
| Video Processor | 🟢 Baja | Alta | No existe |
| WebSockets | 🟢 Baja | Media | No existe |
| Internacionalización | 🟢 Baja | Baja | Solo español |
---
## 🧪 1. Testing Completo (CRÍTICO)
### Estado Actual
- Solo existe `tests/conftest.py` y `tests/__init__.py`
- No hay tests unitarios ni de integración implementados
- Arquitectura mencionada en `ARCHITECTURE.md` indica ~60% cobertura (falso)
### Recomendaciones
#### 1.1 Tests Unitarios
```
tests/
├── unit/
│ ├── test_settings.py # Validar configuración
│ ├── test_validators.py # Validar validators.py
│ ├── test_result.py # Patrón Result
│ ├── test_exceptions.py # Excepciones personalizadas
│ ├── test_bloom_filter.py # BloomFilter en registry
│ ├── test_token_bucket.py # Rate limiter
│ └── test_circuit_breaker.py # Circuit breaker
```
#### 1.2 Tests de Integración
```
tests/
├── integration/
│ ├── test_webdav_service.py # Mock de Nextcloud
│ ├── test_telegram_service.py # Mock de Telegram API
│ ├── test_ai_providers.py # Mock de APIs AI
│ ├── test_audio_processor.py # Con audio de prueba
│ ├── test_pdf_processor.py # Con PDF de prueba
│ └── test_document_generator.py
```
#### 1.3 Tests E2E
```
tests/
├── e2e/
│ ├── test_full_audio_workflow.py
│ ├── test_full_pdf_workflow.py
│ └── test_api_endpoints.py
```
#### 1.4 Fixtures de Prueba
```python
# tests/fixtures/
# - sample_audio.mp3 (5 segundos de audio en español)
# - sample_pdf.pdf (2 páginas con texto)
# - expected_transcription.txt
# - expected_summary.md
```
---
## 🖥️ 2. Dashboard Frontend Completo
### Estado Actual
- Solo existe `templates/` con un archivo básico
- API REST sin interfaz visual
- Sin JavaScript interactivo
### Recomendaciones
#### 2.1 Estructura Frontend
```
frontend/
├── src/
│ ├── components/
│ │ ├── FileList.js # Lista de archivos
│ │ ├── FileCard.js # Tarjeta individual
│ │ ├── ProcessingStatus.js # Estado en tiempo real
│ │ ├── GPUMonitor.js # Monitor VRAM
│ │ ├── QueueViewer.js # Cola de procesamiento
│ │ └── NotificationBell.js # Notificaciones
│ ├── pages/
│ │ ├── Dashboard.js # Vista principal
│ │ ├── Files.js # Gestión de archivos
│ │ ├── Settings.js # Configuración
│ │ └── Logs.js # Visor de logs
│ └── services/
│ ├── api.js # Cliente API
│ └── websocket.js # Conexión WS
├── public/
│ └── index.html
└── package.json
```
#### 2.2 Funcionalidades
- [ ] Drag & drop para subir archivos
- [ ] Preview de PDFs y audio
- [ ] Visor de transcripciones lado a lado
- [ ] Editor de resúmenes con Markdown preview
- [ ] Gráficas de uso de GPU/CPU
- [ ] Historial de procesamiento
- [ ] Búsqueda en contenido
- [ ] Dark mode / Light mode
---
## 📬 3. Sistema de Colas (Celery/RQ)
### Estado Actual
- Loop infinito síncrono en `main.py`
- Sin priorización de tareas
- Sin reintentos configurables
- Sin distribución de carga
### Recomendaciones
#### 3.1 Implementar Celery
```python
# services/queue/
__init__.py
celery_app.py # Configuración Celery
tasks/
__init__.py
audio_tasks.py # Tareas de audio
pdf_tasks.py # Tareas de PDF
notification_tasks.py
workers/
worker_config.py
```
#### 3.2 Estructura de Tareas
```python
# tasks/audio_tasks.py
from celery import shared_task
@shared_task(bind=True, max_retries=3, default_retry_delay=60)
def process_audio(self, file_path: str, options: dict) -> dict:
"""Procesar audio con reintentos automáticos"""
...
@shared_task
def transcribe_audio(file_path: str) -> str:
"""Transcribir audio con Whisper"""
...
@shared_task
def generate_summary(transcription: str, base_name: str) -> dict:
"""Generar resumen con IA"""
...
```
#### 3.3 Prioridades de Cola
- `high`: Archivos pequeños (<10MB)
- `default`: Archivos normales
- `low`: Archivos grandes (>100MB)
---
## 🔐 4. Autenticación y Autorización
### Estado Actual
- API completamente abierta
- Sin manejo de sesiones
- Sin roles de usuario
### Recomendaciones
#### 4.1 Implementar JWT
```python
# api/auth/
__init__.py
jwt_handler.py # Generación/validación JWT
middleware.py # Middleware de autenticación
decorators.py # @require_auth, @require_admin
models.py # User, Role, Permission
```
#### 4.2 Endpoints de Auth
```python
# api/routes_auth.py
POST /api/auth/login # Login con usuario/password
POST /api/auth/refresh # Refrescar token
POST /api/auth/logout # Invalidar token
GET /api/auth/me # Perfil del usuario
```
#### 4.3 Roles Sugeridos
- `admin`: Acceso completo
- `processor`: Puede procesar archivos
- `viewer`: Solo lectura
- `api`: Acceso solo API (para integraciones)
---
## 🗄️ 5. Base de Datos (SQLite/PostgreSQL)
### Estado Actual
- Solo `processed_files.txt` como registro
- Sin historial de procesamiento
- Sin metadatos de archivos
### Recomendaciones
#### 5.1 Modelos de Base de Datos
```python
# storage/models/
__init__.py
base.py # SQLAlchemy base
file.py # Modelo File
processing_job.py # Modelo ProcessingJob
user.py # Modelo User
audit_log.py # Modelo AuditLog
```
#### 5.2 Esquema Propuesto
```sql
-- files
CREATE TABLE files (
id SERIAL PRIMARY KEY,
filename VARCHAR(255) NOT NULL,
original_path TEXT,
file_type VARCHAR(20),
file_size BIGINT,
checksum VARCHAR(64),
status VARCHAR(20) DEFAULT 'pending',
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW()
);
-- processing_jobs
CREATE TABLE processing_jobs (
id SERIAL PRIMARY KEY,
file_id INTEGER REFERENCES files(id),
job_type VARCHAR(50),
status VARCHAR(20),
started_at TIMESTAMP,
completed_at TIMESTAMP,
error_message TEXT,
result_path TEXT,
metadata JSONB
);
-- audit_logs
CREATE TABLE audit_logs (
id SERIAL PRIMARY KEY,
user_id INTEGER,
action VARCHAR(50),
resource_type VARCHAR(50),
resource_id INTEGER,
details JSONB,
timestamp TIMESTAMP DEFAULT NOW()
);
```
---
## 📊 6. Métricas y Observabilidad
### Estado Actual
- `services/metrics_collector.py` básico
- Sin exportación a sistemas externos
- Sin dashboards de monitoreo
### Recomendaciones
#### 6.1 Prometheus Metrics
```python
# services/observability/
__init__.py
prometheus_exporter.py # Endpoint /metrics
metrics.py # Definición de métricas
tracing.py # Tracing distribuido
```
#### 6.2 Métricas a Implementar
```python
from prometheus_client import Counter, Histogram, Gauge
# Contadores
files_processed_total = Counter('files_processed_total', 'Total files processed', ['type', 'status'])
ai_requests_total = Counter('ai_requests_total', 'AI API requests', ['provider', 'operation'])
# Histogramas
processing_duration = Histogram('processing_duration_seconds', 'Processing time', ['type'])
ai_response_time = Histogram('ai_response_time_seconds', 'AI response time', ['provider'])
# Gauges
active_jobs = Gauge('active_jobs', 'Currently processing jobs')
vram_usage = Gauge('vram_usage_bytes', 'GPU memory usage')
queue_size = Gauge('queue_size', 'Jobs in queue', ['priority'])
```
#### 6.3 Integración
- [ ] Grafana dashboard preconfigurado
- [ ] Alertas con AlertManager
- [ ] Logs estructurados con Loki
- [ ] Tracing con Jaeger/Zipkin
---
## 🎬 7. Video Processor (NUEVO)
### Recomendaciones
#### 7.1 Estructura
```python
# processors/video_processor.py
class VideoProcessor(FileProcessor):
"""Processor for video files"""
def extract_audio(self, video_path: str) -> str:
"""Extraer audio de video con ffmpeg"""
...
def extract_frames(self, video_path: str, interval: int = 60) -> List[str]:
"""Extraer frames cada N segundos para análisis"""
...
def analyze_frames(self, frames: List[str]) -> Dict[str, Any]:
"""Analizar frames con visión AI (Gemini Vision)"""
...
def process(self, file_path: str) -> Dict[str, Any]:
"""Pipeline completo: audio + frames + análisis"""
...
```
#### 7.2 Extensiones de Video
```python
VIDEO_EXTENSIONS = {".mp4", ".avi", ".mkv", ".mov", ".webm"}
```
#### 7.3 Funcionalidades
- [ ] Transcripción de audio del video
- [ ] Extracción de frames clave
- [ ] Análisis visual con IA (slides, pizarra)
- [ ] Generación de índice por escenas
- [ ] Subtítulos automáticos (SRT/VTT)
---
## 🔌 8. WebSockets para Tiempo Real
### Estado Actual
- Solo API REST
- Sin actualizaciones en tiempo real
- Polling pesado para estado
### Recomendaciones
#### 8.1 Implementación
```python
# api/websocket/
__init__.py
manager.py # ConnectionManager
events.py # Tipos de eventos
handlers.py # Event handlers
```
#### 8.2 Eventos a Implementar
```python
# Eventos del servidor -> cliente
{
"type": "file.processing_started",
"data": {"file_id": 1, "filename": "audio.mp3"}
}
{
"type": "file.processing_progress",
"data": {"file_id": 1, "progress": 45, "stage": "transcribing"}
}
{
"type": "file.processing_completed",
"data": {"file_id": 1, "result_path": "/path/to/result.docx"}
}
{
"type": "system.gpu_usage",
"data": {"vram_used": 4.5, "vram_total": 8.0}
}
```
#### 8.3 Integración con Flask
```python
from flask_socketio import SocketIO, emit
socketio = SocketIO(app, cors_allowed_origins="*")
@socketio.on('connect')
def handle_connect():
emit('connected', {'status': 'ok'})
@socketio.on('subscribe')
def handle_subscribe(data):
join_room(data['file_id'])
```
---
## 🌐 9. API versioning y OpenAPI
### Estado Actual
- API sin versionado
- Sin documentación OpenAPI/Swagger
- Endpoints inconsistentes
### Recomendaciones
#### 9.1 Versionado de API
```
/api/v1/files
/api/v1/process
/api/v1/health
/api/v2/files (futura versión)
```
#### 9.2 OpenAPI Spec
```python
# api/openapi/
spec.yaml # Especificación OpenAPI 3.0
swagger_ui.py # Swagger UI integration
# Usar flask-restx o flasgger
from flask_restx import Api, Resource, fields
api = Api(app,
version='1.0',
title='CBCFacil API',
description='API para procesamiento de documentos'
)
```
---
## 🐳 10. Containerización Mejorada
### Estado Actual
- `.dockerignore` existe pero no Dockerfile completo
- Sin docker-compose
- Sin multi-stage builds
### Recomendaciones
#### 10.1 Docker Multi-stage
```dockerfile
# Dockerfile
FROM python:3.11-slim as builder
WORKDIR /app
COPY requirements.txt .
RUN pip wheel --no-cache-dir -w /wheels -r requirements.txt
FROM nvidia/cuda:12.1-runtime-ubuntu22.04 as runtime
# ... instalación optimizada
```
#### 10.2 Docker Compose
```yaml
# docker-compose.yml
version: '3.8'
services:
app:
build: .
ports:
- "5000:5000"
environment:
- NVIDIA_VISIBLE_DEVICES=all
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
redis:
image: redis:7-alpine
celery-worker:
build: .
command: celery -A celery_app worker -l info
depends_on:
- redis
prometheus:
image: prom/prometheus
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
grafana:
image: grafana/grafana
ports:
- "3000:3000"
```
---
## 🔧 11. Unificación y Refactoring
### 11.1 AIProvider Unificado
Actualmente existe lógica duplicada entre:
- `services/ai_service.py`
- `services/ai/gemini_provider.py`
- `services/ai/claude_provider.py`
**Recomendación**: Crear interfaz unificada con Chain of Responsibility:
```python
class AIProviderChain:
"""Cadena de proveedores con fallback automático"""
def __init__(self, providers: List[AIProvider]):
self.providers = providers
def generate(self, prompt: str) -> str:
for provider in self.providers:
try:
if provider.is_available():
return provider.generate_text(prompt)
except Exception as e:
logging.warning(f"{provider.name} failed: {e}")
raise AllProvidersFailedError()
```
### 11.2 Procesadores Unificados
Crear pipeline unificado:
```python
class ProcessingPipeline:
def __init__(self):
self.steps = []
def add_step(self, processor: FileProcessor):
self.steps.append(processor)
return self
def process(self, file_path: str) -> Dict[str, Any]:
result = {}
for step in self.steps:
if step.can_process(file_path):
result.update(step.process(file_path))
return result
```
---
## 📱 12. Notificaciones Mejoradas
### Estado Actual
- Solo Telegram
- Sin templates de mensajes
- Sin notificaciones push
### Recomendaciones
#### 12.1 Multi-canal
```python
# services/notifications/
__init__.py
base_notifier.py # Interface base
telegram_notifier.py # Actual optimizado
email_notifier.py # Nuevo
slack_notifier.py # Nuevo
webhook_notifier.py # Para integraciones
notification_manager.py # Orquestador
```
#### 12.2 Templates de Mensaje
```python
TEMPLATES = {
    "processing_started": "🎵 Procesando: {filename}\n⏱️ Estimado: {eta}",
    "processing_completed": "✅ Completado: {filename}\n📄 Resumen: {summary_url}",
    "processing_failed": "❌ Error en {filename}\n🔍 Detalles: {error}",
    "daily_summary": "📊 Resumen del día:\n- Procesados: {count}\n- Tiempo total: {time}"
}
```
---
## 🗂️ 13. Sistema de Plugins
### Recomendaciones
```python
# plugins/
__init__.py
base_plugin.py # Interface de plugin
plugin_manager.py # Gestor de plugins
examples/
custom_ocr/ # Plugin OCR personalizado
s3_storage/ # Plugin para AWS S3
discord_notifier/ # Plugin Discord
```
#### Interfaz de Plugin
```python
class BasePlugin(ABC):
"""Base class for plugins"""
@property
@abstractmethod
def name(self) -> str: ...
@property
@abstractmethod
def version(self) -> str: ...
@abstractmethod
def initialize(self, config: dict) -> None: ...
@abstractmethod
def execute(self, context: dict) -> dict: ...
@abstractmethod
def cleanup(self) -> None: ...
```
---
## 📈 14. Mejoras de Rendimiento
### 14.1 Caching Avanzado
```python
# services/cache/
__init__.py
cache_manager.py # Gestor de cache
redis_cache.py # Cache en Redis
file_cache.py # Cache en disco
```
### 14.2 Batch Processing
```python
class BatchProcessor:
"""Procesar múltiples archivos en paralelo"""
def process_batch(self, files: List[str], max_workers: int = 4) -> List[dict]:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = {executor.submit(self.process_single, f): f for f in files}
results = []
for future in as_completed(futures):
results.append(future.result())
return results
```
### 14.3 Streaming de Archivos Grandes
```python
def stream_process(file_path: str, chunk_size: int = 1024*1024):
"""Procesar archivos grandes en streaming"""
with open(file_path, 'rb') as f:
while chunk := f.read(chunk_size):
yield process_chunk(chunk)
```
---
## 🔒 15. Seguridad Adicional
### 15.1 Validación de Archivos
```python
# services/security/
__init__.py
file_validator.py # Validación de archivos
malware_scanner.py # Escaneo de malware
rate_limiter.py # Rate limiting por IP
```
### 15.2 Checks de Seguridad
- [ ] Validar tipos MIME reales (no solo extensiones)
- [ ] Limitar tamaño máximo de archivo
- [ ] Sanitizar nombres de archivo
- [ ] Escanear con ClamAV
- [ ] Rate limiting por usuario/IP
- [ ] Logs de auditoría
---
## 📝 16. CLI Mejorado
### Estado Actual
- Solo comandos básicos en `main.py`
### Recomendaciones
```python
# cli/
__init__.py
main.py # Click/Typer app
commands/
process.py # cbcfacil process audio.mp3
queue.py # cbcfacil queue list/stats
config.py # cbcfacil config show/set
db.py # cbcfacil db migrate/seed
```
#### Ejemplo con Typer
```python
import typer
app = typer.Typer()
@app.command()
def process(
file: str,
output_dir: str = ".",
format: str = "docx",
ai_provider: str = "auto"
):
"""Procesar archivo de audio o PDF"""
...
@app.command()
def status():
"""Mostrar estado del servicio"""
...
@app.command()
def queue(action: str):
"""Gestionar cola de procesamiento"""
...
```
---
## 📁 Resumen de Nuevos Archivos/Directorios
```
cbc/
├── tests/
│ ├── unit/
│ ├── integration/
│ ├── e2e/
│ └── fixtures/
├── frontend/
│ ├── src/
│ └── public/
├── services/
│ ├── queue/
│ ├── cache/
│ ├── notifications/
│ ├── observability/
│ └── security/
├── api/
│ ├── auth/
│ ├── websocket/
│ └── openapi/
├── storage/
│ └── models/
├── processors/
│ └── video_processor.py
├── plugins/
├── cli/
├── docker-compose.yml
├── prometheus.yml
└── grafana/
```
---
## ✅ Checklist de Implementación
### Fase 1 - Fundamentos (2-3 semanas)
- [ ] Implementar tests unitarios básicos
- [ ] Agregar autenticación JWT
- [ ] Migrar a base de datos SQLite
### Fase 2 - Mejoras Core (3-4 semanas)
- [ ] Implementar sistema de colas con Celery
- [ ] Agregar WebSockets
- [ ] Crear dashboard frontend básico
### Fase 3 - Observabilidad (1-2 semanas)
- [ ] Prometheus metrics
- [ ] Grafana dashboards
- [ ] Logging estructurado
### Fase 4 - Extensiones (2-3 semanas)
- [ ] Video processor
- [ ] Multi-canal de notificaciones
- [ ] Sistema de plugins
### Fase 5 - Producción (2 semanas)
- [ ] Docker compose completo
- [ ] CI/CD pipeline
- [ ] Documentación completa
---
*Documento generado por análisis exhaustivo del proyecto CBCFacil v9*

File diff suppressed because it is too large Load Diff

1198
todo.md Normal file

File diff suppressed because it is too large Load Diff