fix: Detección estricta de archivos ya procesados

- Compara nombres exactos (stem.txt) en vez de substrings
- Agrega verificación en callback de descarga
- Evita re-procesamiento de archivos

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 17:28:04 +00:00
parent 1f6bfa771b
commit d902203b59
2 changed files with 16 additions and 31 deletions
--- a/main.py
+++ b/main.py
@@ -326,6 +326,13 @@ class PollingService:

    def _on_file_downloaded(self, file_path: Path) -> None:
        """Callback when a file is downloaded."""
+        # Verificar si ya fue procesado (existe transcripción con nombre exacto)
+        transcriptions_dir = settings.TRANSCRIPTIONS_DIR
+        txt_path = transcriptions_dir / f"{file_path.stem}.txt"
+        if txt_path.exists():
+            logger.info(f"Skipping already processed file: {file_path.name}")
+            return
+
        self.queue_file_for_processing(file_path)

    def queue_file_for_processing(self, file_path: Path) -> None:
@@ -419,31 +426,18 @@ class PollingService:
        # Extensiones de audio soportadas
        audio_extensions = {".mp3", ".wav", ".m4a", ".mp4", ".webm", ".ogg", ".flac"}

-        # Obtener transcripciones existentes para comparar
+        # Obtener transcripciones existentes - verificar por nombre EXACTO
        transcriptions_dir = settings.TRANSCRIPTIONS_DIR
-        processed_names = set()
-        if transcriptions_dir.exists():
-            for f in transcriptions_dir.iterdir():
-                if f.is_file() and f.suffix == ".txt":
-                    # Extraer nombre base sin extensión
-                    processed_names.add(f.stem)

        # Filtrar solo archivos que NO han sido procesados
        pending_files = []
        for f in downloads_dir.iterdir():
            if f.is_file() and f.suffix.lower() in audio_extensions and not f.name.startswith("."):
-                # Verificar si ya existe transcripción para este archivo
-                file_stem = f.stem
-                # También verificar versiones con " (copy)" o similar
-                if file_stem not in processed_names:
-                    # Verificar variaciones del nombre
-                    is_processed = False
-                    for processed in processed_names:
-                        if file_stem in processed or processed in file_stem:
-                            is_processed = True
-                            break
-                    if not is_processed:
-                        pending_files.append(f)
+                # Verificar si ya existe transcripción con el MISMO nombre
+                txt_path = transcriptions_dir / f"{f.stem}.txt"
+                if not txt_path.exists():
+                    # No existe .txt, agregar a pendientes
+                    pending_files.append(f)

        if not pending_files:
            logger.debug("No pending audio files to process")
--- a/watchers/folder_watcher.py
+++ b/watchers/folder_watcher.py
@@ -202,23 +202,14 @@ class RemoteFolderWatcher:

        # Verificar si ya existe el archivo localmente
        if local_path.exists():
-            # Verificar si ya fue procesado (existe transcripción)
+            # Verificar si ya fue procesado (existe transcripción con nombre EXACTO)
            stem = local_path.stem
            transcriptions_dir = self.local_path.parent / "transcriptions"
-            processed = False
+            txt_path = transcriptions_dir / f"{stem}.txt"

-            if transcriptions_dir.exists():
-                for f in transcriptions_dir.iterdir():
-                    if f.suffix == ".txt" and stem in f.stem:
-                        processed = True
-                        break
-
-            if processed:
+            if txt_path.exists():
                self.logger.info(f"Skipping already processed file: {filename}")
                return
-            else:
-                # Existe pero no procesado, re-descargar
-                self.logger.info(f"Re-downloading incomplete file: {filename}")

        self.logger.info(f"Downloading: {remote_path}")