fix: Detección estricta de archivos ya procesados
- Compara nombres exactos (stem.txt) en vez de substrings
- Agrega verificación en callback de descarga
- Evita re-procesamiento de archivos

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
32
main.py
32
main.py
@@ -326,6 +326,13 @@ class PollingService:
|
|||||||
|
|
||||||
def _on_file_downloaded(self, file_path: Path) -> None:
|
def _on_file_downloaded(self, file_path: Path) -> None:
|
||||||
"""Callback when a file is downloaded."""
|
"""Callback when a file is downloaded."""
|
||||||
|
# Verificar si ya fue procesado (existe transcripción con nombre exacto)
|
||||||
|
transcriptions_dir = settings.TRANSCRIPTIONS_DIR
|
||||||
|
txt_path = transcriptions_dir / f"{file_path.stem}.txt"
|
||||||
|
if txt_path.exists():
|
||||||
|
logger.info(f"Skipping already processed file: {file_path.name}")
|
||||||
|
return
|
||||||
|
|
||||||
self.queue_file_for_processing(file_path)
|
self.queue_file_for_processing(file_path)
|
||||||
|
|
||||||
def queue_file_for_processing(self, file_path: Path) -> None:
|
def queue_file_for_processing(self, file_path: Path) -> None:
|
||||||
@@ -419,31 +426,18 @@ class PollingService:
|
|||||||
# Extensiones de audio soportadas
|
# Extensiones de audio soportadas
|
||||||
audio_extensions = {".mp3", ".wav", ".m4a", ".mp4", ".webm", ".ogg", ".flac"}
|
audio_extensions = {".mp3", ".wav", ".m4a", ".mp4", ".webm", ".ogg", ".flac"}
|
||||||
|
|
||||||
# Obtener transcripciones existentes para comparar
|
# Obtener transcripciones existentes - verificar por nombre EXACTO
|
||||||
transcriptions_dir = settings.TRANSCRIPTIONS_DIR
|
transcriptions_dir = settings.TRANSCRIPTIONS_DIR
|
||||||
processed_names = set()
|
|
||||||
if transcriptions_dir.exists():
|
|
||||||
for f in transcriptions_dir.iterdir():
|
|
||||||
if f.is_file() and f.suffix == ".txt":
|
|
||||||
# Extraer nombre base sin extensión
|
|
||||||
processed_names.add(f.stem)
|
|
||||||
|
|
||||||
# Filtrar solo archivos que NO han sido procesados
|
# Filtrar solo archivos que NO han sido procesados
|
||||||
pending_files = []
|
pending_files = []
|
||||||
for f in downloads_dir.iterdir():
|
for f in downloads_dir.iterdir():
|
||||||
if f.is_file() and f.suffix.lower() in audio_extensions and not f.name.startswith("."):
|
if f.is_file() and f.suffix.lower() in audio_extensions and not f.name.startswith("."):
|
||||||
# Verificar si ya existe transcripción para este archivo
|
# Verificar si ya existe transcripción con el MISMO nombre
|
||||||
file_stem = f.stem
|
txt_path = transcriptions_dir / f"{f.stem}.txt"
|
||||||
# También verificar versiones con " (copy)" o similar
|
if not txt_path.exists():
|
||||||
if file_stem not in processed_names:
|
# No existe .txt, agregar a pendientes
|
||||||
# Verificar variaciones del nombre
|
pending_files.append(f)
|
||||||
is_processed = False
|
|
||||||
for processed in processed_names:
|
|
||||||
if file_stem in processed or processed in file_stem:
|
|
||||||
is_processed = True
|
|
||||||
break
|
|
||||||
if not is_processed:
|
|
||||||
pending_files.append(f)
|
|
||||||
|
|
||||||
if not pending_files:
|
if not pending_files:
|
||||||
logger.debug("No pending audio files to process")
|
logger.debug("No pending audio files to process")
|
||||||
|
|||||||
@@ -202,23 +202,14 @@ class RemoteFolderWatcher:
|
|||||||
|
|
||||||
# Verificar si ya existe el archivo localmente
|
# Verificar si ya existe el archivo localmente
|
||||||
if local_path.exists():
|
if local_path.exists():
|
||||||
# Verificar si ya fue procesado (existe transcripción)
|
# Verificar si ya fue procesado (existe transcripción con nombre EXACTO)
|
||||||
stem = local_path.stem
|
stem = local_path.stem
|
||||||
transcriptions_dir = self.local_path.parent / "transcriptions"
|
transcriptions_dir = self.local_path.parent / "transcriptions"
|
||||||
processed = False
|
txt_path = transcriptions_dir / f"{stem}.txt"
|
||||||
|
|
||||||
if transcriptions_dir.exists():
|
if txt_path.exists():
|
||||||
for f in transcriptions_dir.iterdir():
|
|
||||||
if f.suffix == ".txt" and stem in f.stem:
|
|
||||||
processed = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if processed:
|
|
||||||
self.logger.info(f"Skipping already processed file: {filename}")
|
self.logger.info(f"Skipping already processed file: {filename}")
|
||||||
return
|
return
|
||||||
else:
|
|
||||||
# Existe pero no procesado, re-descargar
|
|
||||||
self.logger.info(f"Re-downloading incomplete file: {filename}")
|
|
||||||
|
|
||||||
self.logger.info(f"Downloading: {remote_path}")
|
self.logger.info(f"Downloading: {remote_path}")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user