- Implementación de detector híbrido (Whisper + Chat + Audio + VLM)
- Sistema de detección de gameplay real vs hablando
- Scene detection con FFmpeg
- Soporte para RTX 3050 y RX 6800 XT
- Guía completa en 6800xt.md para próxima IA
- Scripts de filtrado visual y análisis de contexto
- Pipeline automatizado de generación de videos
210 lines
5.9 KiB
Python
Executable File
210 lines
5.9 KiB
Python
Executable File
#!/opt/vlm_env/bin/python3
|
|
"""
|
|
VLM GAMEPLAY DETECTOR - Nivel Senior
|
|
Usa Moondream 2B local en GPU para detectar gameplay real de LoL
|
|
"""
|
|
|
|
import sys
|
|
|
|
sys.path.insert(0, "/opt/vlm_env/lib/python3.13/site-packages")
|
|
|
|
import torch
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
from PIL import Image
|
|
import subprocess
|
|
import json
|
|
from pathlib import Path
|
|
import time
|
|
|
|
# --- Startup banner ----------------------------------------------------------
banner = "=" * 70
print(banner)
print("🎮 VLM GAMEPLAY DETECTOR - Moondream 2B (Local GPU)")
print(banner)
gpu_props = torch.cuda.get_device_properties(0)
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"VRAM: {gpu_props.total_memory / 1024**3:.1f} GB")
print()

# --- Load the Moondream 2B vision-language model onto the GPU ----------------
print("📥 Cargando Moondream 2B en GPU...")
model_id = "vikhyatk/moondream2"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,   # fp16 halves VRAM use vs fp32
    device_map={"": "cuda"},     # place the whole model on the first CUDA device
)
print("✅ Modelo cargado y listo")
print()
|
|
|
|
|
def analyze_frame(image_path, timestamp):
    """Classify one video frame with the Moondream VLM.

    Parameters
    ----------
    image_path : str
        Path to the extracted JPEG frame on disk.
    timestamp : int
        Position of the frame in the video, in seconds (reporting only).

    Returns
    -------
    dict | None
        ``{"timestamp", "is_gameplay", "classification", "confidence"}`` on
        success, or ``None`` on any failure.  Errors are printed rather than
        raised on purpose: one bad frame must not abort the whole scan.
    """
    # Prompt tailored to League of Legends stream footage.  The model is asked
    # for a single keyword so the reply can be matched by substring below.
    prompt = """Look at this image from a gaming stream. Is this showing:
1. ACTIVE GAMEPLAY - League of Legends match in progress (map visible, champions fighting, abilities being used)
2. CHAMPION SELECT - Lobby or selection screen
3. STREAMER TALKING - Just the streamer face/webcam without game visible
4. MENU/WAITING - Game menus, loading screens, or waiting

Answer with ONLY ONE word: GAMEPLAY, SELECT, TALKING, or MENU"""

    try:
        # Context manager guarantees the image file handle is released even if
        # encode_image/answer_question raises (the original leaked it).
        with Image.open(image_path) as image:
            enc_image = model.encode_image(image)
            answer = model.answer_question(enc_image, prompt, tokenizer)

        result = answer.strip().upper()

        # Substring match tolerates extra words in the model's reply.
        is_gameplay = "GAMEPLAY" in result

        return {
            "timestamp": timestamp,
            "is_gameplay": is_gameplay,
            "classification": result,
            "confidence": "HIGH" if is_gameplay else "LOW",
        }
    except Exception as e:
        # Best-effort: report and let the caller skip this frame.
        print(f" Error en {timestamp}s: {e}")
        return None
|
|
|
|
|
|
# --- Input video and duration probe ------------------------------------------
video_path = (
    "/home/ren/proyectos/editor/twitch-highlight-detector/nuevo_stream_360p.mp4"
)

# Ask ffprobe for the container duration: a bare float in seconds on stdout.
result = subprocess.run(
    [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        video_path,
    ],
    capture_output=True,
    text=True,
)

# Fail fast with a readable message instead of the opaque ValueError that
# float("") would raise when ffprobe is missing or the file is unreadable.
if result.returncode != 0:
    sys.exit(f"ffprobe falló para {video_path}: {result.stderr.strip()}")

duration = float(result.stdout.strip())
print(f"📹 Video: {duration / 60:.1f} minutos ({duration / 3600:.1f} horas)")
print("🔍 Analizando cada 30 segundos con VLM...")
print(" (Esto tomará ~10-15 minutos)")
print()
|
|
|
|
# --- Scan configuration ------------------------------------------------------
check_interval = 30   # seconds between sampled frames
# Offset where analysis begins — presumably skips this VOD's pre-game intro
# (~7.5 minutes); confirm/adjust per stream instead of leaving it magic.
ANALYSIS_START = 455

timestamps = list(range(ANALYSIS_START, int(duration), check_interval))

segments = []         # finished gameplay segments: {"start", "end", "duration"}
in_gameplay = False   # are we currently inside a gameplay run?
start_ts = None       # timestamp (seconds) where the current run began
start_time = time.time()
|
|
|
|
# --- Main scan loop ----------------------------------------------------------
for i, ts in enumerate(timestamps):
    mins, secs = divmod(ts, 60)

    # Extract one downscaled frame at ts.  NOTE: `-ss` goes BEFORE `-i`
    # (input seeking), which jumps via keyframes instead of decoding the whole
    # stream up to ts — the original output-seek form re-decoded the video for
    # every sample and is dramatically slower on multi-hour VODs.
    frame_path = f"/tmp/vlm_frame_{ts}.jpg"
    subprocess.run(
        [
            "ffmpeg",
            "-y",
            "-ss",
            str(ts),
            "-i",
            video_path,
            "-vframes",
            "1",
            "-vf",
            "scale=512:288",  # small, but sufficient resolution for the VLM
            "-q:v",
            "3",
            frame_path,
        ],
        capture_output=True,
    )

    # ffmpeg failed (seek past EOF, corrupt section, ...) — skip this sample.
    if not Path(frame_path).exists():
        continue

    # Classify the frame with the VLM (returns None on failure).
    analysis = analyze_frame(frame_path, ts)

    if analysis:
        icon = "🎮" if analysis["is_gameplay"] else "🗣️"
        print(f"{mins:02d}:{secs:02d} {icon} {analysis['classification']}")

        # Segment state machine: open a run on the first GAMEPLAY frame,
        # close it on the first non-GAMEPLAY frame afterwards.
        if analysis["is_gameplay"]:
            if not in_gameplay:
                start_ts = ts
                in_gameplay = True
                print(" └─ INICIO gameplay")
        else:
            if in_gameplay and start_ts:
                seg_duration = ts - start_ts
                if seg_duration > 60:  # ignore blips shorter than a minute
                    segments.append(
                        {"start": start_ts, "end": ts, "duration": seg_duration}
                    )
                    print(
                        f" └─ FIN gameplay ({seg_duration // 60}m {seg_duration % 60}s)"
                    )
                in_gameplay = False
                start_ts = None

    # Remove the temp frame whether or not analysis succeeded.
    Path(frame_path).unlink(missing_ok=True)

    # Progress / ETA report every 10 frames.
    if (i + 1) % 10 == 0:
        elapsed = time.time() - start_time
        remaining = (elapsed / (i + 1)) * (len(timestamps) - i - 1)
        print(
            f"\n Progreso: {i + 1}/{len(timestamps)} frames | "
            f"Tiempo restante: {remaining // 60:.0f}m {remaining % 60:.0f}s\n"
        )
|
|
|
|
# Flush a gameplay run that was still open when the video ended.
if in_gameplay and start_ts:
    end = int(duration)
    segments.append({"start": start_ts, "end": end, "duration": end - start_ts})
|
|
|
|
# --- Summary -----------------------------------------------------------------
divider = "=" * 70
print(f"\n{divider}")
print("✅ ANÁLISIS VLM COMPLETADO")
print(divider)
print(f"Segmentos de gameplay: {len(segments)}")
total_gameplay = sum(s["duration"] for s in segments)
gp_min, gp_sec = divmod(total_gameplay, 60)
print(f"Tiempo total gameplay: {gp_min}m {gp_sec}s")
# Non-gameplay time excludes the skipped 455-second intro.
print(f"Tiempo total hablando/otros: {(int(duration) - 455 - total_gameplay) // 60}m")
print()
|
|
|
|
# List each detected segment as h:mm ranges with its length.
# (The original computed unused seconds variables; dropped here.)
for i, seg in enumerate(segments, 1):
    hours_s, mins_s = divmod(seg["start"] // 60, 60)
    hours_e, mins_e = divmod(seg["end"] // 60, 60)
    dur_m, dur_s = divmod(seg["duration"], 60)
    print(
        f"{i}. {hours_s}h{mins_s:02d}m - {hours_e}h{mins_e:02d}m "
        f"({dur_m}m {dur_s}s)"
    )
|
|
|
|
# Persist the detected zones for the downstream highlight-filter step.
output_file = (
    "/home/ren/proyectos/editor/twitch-highlight-detector/gameplay_vlm_zones.json"
)
Path(output_file).write_text(json.dumps(segments, indent=2))

print(f"\n💾 Guardado: {output_file}")
print("\nAhora puedes filtrar highlights usando estos rangos exactos.")
|