- Implementación de detector híbrido (Whisper + Chat + Audio + VLM) - Sistema de detección de gameplay real vs hablando - Scene detection con FFmpeg - Soporte para RTX 3050 y RX 6800 XT - Guía completa en 6800xt.md para próxima IA - Scripts de filtrado visual y análisis de contexto - Pipeline automatizado de generación de videos
56 lines · 1.7 KiB · Python
#!/usr/bin/env python3
"""
Transcribe the stream's audio using Whisper.
"""

import sys
import json
import logging
import whisper
import numpy as np
from pathlib import Path
# NOTE(review): sys, numpy and Path are not used in the visible code — confirm
# they are needed before removing.

# INFO level so per-step progress messages are visible when run as a script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
def transcribe_video(video_path, model_size="base", output_json="transcripcion.json",
                     device="cuda", language="es"):
    """Transcribe a video's audio with Whisper and save the result as JSON.

    Args:
        video_path: Path to the video (or audio) file to transcribe.
        model_size: Whisper model size ("tiny", "base", "small", "medium", "large").
        output_json: Destination file for the full transcription (UTF-8 JSON).
        device: Torch device to load the model on (e.g. "cuda" or "cpu").
        language: ISO 639-1 code of the spoken language. Defaults to Spanish
            (the stream is in Spanish); parameterized so other streams work too.

    Returns:
        The raw Whisper result dict. It contains at least "text" (full
        transcript) and "segments" (list of dicts with "start"/"end" times).
    """
    logger.info(f"Cargando modelo Whisper ({model_size}) en {device}...")
    model = whisper.load_model(model_size, device=device)

    logger.info(f"Transcribiendo video: {video_path}")
    result = model.transcribe(
        video_path,
        language=language,
        task="transcribe",
        word_timestamps=True,  # per-word timestamps are needed downstream for clip cutting
        verbose=False,
    )

    # Persist the complete transcription (text + timestamped segments).
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    logger.info(f"Transcripción guardada en {output_json}")

    # Summary: total transcribed duration is the end time of the last segment
    # (0 when Whisper produced no segments, e.g. silent input).
    segments = result.get("segments") or []
    duration = segments[-1]["end"] if segments else 0
    logger.info(f"Duración transcrita: {duration:.1f}s ({duration/60:.1f} min)")
    logger.info(f"Texto: {len(result.get('text', ''))} caracteres")

    return result
|
|
|
|
def main():
    """CLI entry point: parse arguments and run the transcription."""
    import argparse
    parser = argparse.ArgumentParser(
        description="Transcribe a video's audio with Whisper and save timestamped JSON."
    )
    parser.add_argument("--video", required=True, help="Path to the video file")
    parser.add_argument("--output", default="transcripcion.json",
                        help="Output JSON path")
    parser.add_argument("--model", default="base",
                        choices=["tiny", "base", "small", "medium", "large"],
                        help="Whisper model size")
    # Fix: transcribe_video accepts a `device` argument but the CLI never
    # exposed it; default "cuda" preserves the previous effective behavior.
    parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"],
                        help="Torch device to run the model on")
    args = parser.parse_args()

    transcribe_video(args.video, args.model, args.output, device=args.device)
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|