#!/usr/bin/env python3
"""Transcribe the audio of a stream recording using Whisper."""

import argparse
import json
import logging
import sys
from pathlib import Path

import numpy as np
import torch
import whisper

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _resolve_device(device):
    """Return *device*, or ``"cpu"`` when CUDA was requested but is unavailable."""
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        logger.warning("CUDA no está disponible; usando CPU en su lugar.")
        return "cpu"
    return device


def transcribe_video(
    video_path,
    model_size: str = "base",
    output_json: str = "transcripcion.json",
    device: str = "cuda",
) -> dict:
    """Transcribe a video with Whisper and save the result as JSON.

    Args:
        video_path: Input media, passed unchanged to ``model.transcribe``.
        model_size: Name of the Whisper checkpoint to load.
        output_json: Path of the JSON file that receives the full result.
            Missing parent directories are created.
        device: Device on which to load the model. If a CUDA device is
            requested but CUDA is not available, the CPU is used instead.

    Returns:
        The result dictionary returned by ``model.transcribe``.
    """
    # Make sure the destination is writable *before* the (potentially very
    # long) transcription, so the work is not lost to a missing directory.
    output_path = Path(output_json)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    device = _resolve_device(device)

    logger.info(f"Cargando modelo Whisper ({model_size}) en {device}...")
    model = whisper.load_model(model_size, device=device)

    logger.info(f"Transcribiendo video: {video_path}")
    result = model.transcribe(
        video_path,
        language="es",  # The audio is expected to be in Spanish.
        task="transcribe",
        word_timestamps=True,  # Per-word timestamps are stored in the result.
        verbose=False,
    )

    # Save the complete transcription.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    logger.info(f"Transcripción guardada en {output_json}")

    # Log a short summary; the end of the last segment is used as the
    # transcribed duration (0 when there are no segments).
    segments = result.get("segments") or []
    duration = segments[-1]["end"] if segments else 0
    logger.info(f"Duración transcrita: {duration:.1f}s ({duration/60:.1f} min)")
    logger.info(f"Texto: {len(result.get('text', ''))} caracteres")

    return result


def main():
    """Parse the command-line arguments and run the transcription."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--video", required=True)
    parser.add_argument("--output", default="transcripcion.json")
    # Offer every checkpoint the installed Whisper version knows about
    # (a superset of tiny/base/small/medium/large).
    parser.add_argument(
        "--model",
        default="base",
        choices=whisper.available_models(),
    )
    parser.add_argument(
        "--device",
        default="cuda",
        help="Dispositivo para el modelo (p. ej. 'cuda' o 'cpu').",
    )
    args = parser.parse_args()

    transcribe_video(args.video, args.model, args.output, device=args.device)


if __name__ == "__main__":
    main()