- Implementación de detector híbrido (Whisper + Chat + Audio + VLM) - Sistema de detección de gameplay real vs hablando - Scene detection con FFmpeg - Soporte para RTX 3050 y RX 6800 XT - Guía completa en 6800xt.md para próxima IA - Scripts de filtrado visual y análisis de contexto - Pipeline automatizado de generación de videos
56 lines · 1.7 KiB · Python
#!/usr/bin/env python3
"""
Transcribe the stream's audio using Whisper.
"""

import sys
import json
import logging
import whisper
import numpy as np
from pathlib import Path
# NOTE(review): sys, numpy and Path are not used in the visible code — confirm
# they are needed before removing.

# INFO level so per-step progress messages are visible when run as a script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
def transcribe_video(video_path, model_size="base", output_json="transcripcion.json",
                     device="cuda", language="es"):
    """Transcribe a video's audio with Whisper and save the result as JSON.

    Args:
        video_path: Path to the video (or audio) file to transcribe.
        model_size: Whisper model size ("tiny", "base", "small", "medium", "large").
        output_json: Destination file for the full transcription (UTF-8 JSON).
        device: Torch device to load the model on (e.g. "cuda" or "cpu").
        language: ISO 639-1 code of the spoken language. Defaults to Spanish
            (the stream is in Spanish); parameterized so other streams work too.

    Returns:
        The raw Whisper result dict. It contains at least "text" (full
        transcript) and "segments" (list of dicts with "start"/"end" times).
    """
    logger.info(f"Cargando modelo Whisper ({model_size}) en {device}...")
    model = whisper.load_model(model_size, device=device)

    logger.info(f"Transcribiendo video: {video_path}")
    result = model.transcribe(
        video_path,
        language=language,
        task="transcribe",
        word_timestamps=True,  # per-word timestamps are needed downstream for clip cutting
        verbose=False,
    )

    # Persist the complete transcription (text + timestamped segments).
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    logger.info(f"Transcripción guardada en {output_json}")

    # Summary: total transcribed duration is the end time of the last segment
    # (0 when Whisper produced no segments, e.g. silent input).
    segments = result.get("segments") or []
    duration = segments[-1]["end"] if segments else 0
    logger.info(f"Duración transcrita: {duration:.1f}s ({duration/60:.1f} min)")
    logger.info(f"Texto: {len(result.get('text', ''))} caracteres")

    return result
|
|
|
|
def main():
    """CLI entry point: parse arguments and run the transcription."""
    import argparse
    parser = argparse.ArgumentParser(
        description="Transcribe a video's audio with Whisper and save timestamped JSON."
    )
    parser.add_argument("--video", required=True, help="Path to the video file")
    parser.add_argument("--output", default="transcripcion.json",
                        help="Output JSON path")
    parser.add_argument("--model", default="base",
                        choices=["tiny", "base", "small", "medium", "large"],
                        help="Whisper model size")
    # Fix: transcribe_video accepts a `device` argument but the CLI never
    # exposed it; default "cuda" preserves the previous effective behavior.
    parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"],
                        help="Torch device to run the model on")
    args = parser.parse_args()

    transcribe_video(args.video, args.model, args.output, device=args.device)
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|