- ManhwaWebScraper.swift: Eliminar force unwrap en URL con guard let
- ManhwaWebScraperOptimized.swift: Eliminar 2 force unwraps en URLs
- StorageServiceOptimized.swift: Usar .first en lugar de subscript [0]
- ImageCache.swift: Usar .first en lugar de subscript [0]
- Agregar caso invalidURL a ScrapingError enum

Build exitoso para iOS 15.0+ (simulador y device)
IPA generado y listo para sideloading

Co-Authored-By: Claude Code <noreply@anthropic.com>
516 lines
20 KiB
Swift
516 lines
20 KiB
Swift
import Foundation
|
|
import Combine
|
|
import WebKit
|
|
|
|
/// Scraper that extracts content from manhwaweb.com through a single hidden
/// `WKWebView`.
///
/// Design notes:
/// 1. One `WKWebView` is created at init and reused for every scrape.
/// 2. Scrape results are cached in memory as JSON strings, with a validity
///    window chosen per call site.
/// 3. The extraction JavaScript lives in `JavaScriptScripts` instead of being
///    written inline at each call site.
/// 4. The page-load wait adapts to recently measured navigation times.
/// 5. Scrapes are serialized, because they all drive the same web view and
///    would otherwise overwrite each other's page.
///
/// The whole type is `@MainActor`-isolated, so its mutable state needs no locks.
@MainActor
class ManhwaWebScraperOptimized: NSObject, ObservableObject {

    // MARK: - Singleton & WebView Reuse

    /// Shared instance. The initializer is private, so this is the only one.
    static let shared = ManhwaWebScraperOptimized()

    /// The single web view reused by every scrape; created in `setupWebView()`.
    private var webView: WKWebView?

    // MARK: - Caching

    /// A cached JSON payload together with the moment it was stored.
    ///
    /// Keeping the timestamp inside the cached object means an entry evicted by
    /// `NSCache` takes its timestamp with it, so no side table can go stale.
    private final class CacheEntry {
        let json: String
        let storedAt: Date

        init(json: String, storedAt: Date) {
            self.json = json
            self.storedAt = storedAt
        }
    }

    /// In-memory cache of scrape results, keyed by "<kind>_<slug>".
    private let htmlCache: NSCache<NSString, CacheEntry>

    /// Default validity window for cached results (30 minutes).
    private let cacheValidDuration: TimeInterval = 1800

    // MARK: - JavaScript Snippets

    /// JavaScript evaluated inside the loaded page to extract data.
    private enum JavaScriptScripts: String {
        case extractChapters = """
        (function() {
            const chapters = [];
            const links = document.querySelectorAll('a[href*="/leer/"]');
            links.forEach(link => {
                const href = link.getAttribute('href');
                const text = link.textContent?.trim();
                if (href && text && href.includes('/leer/')) {
                    const match = href.match(/(\\d+)(?:\\/|\\?|\\s*$)/);
                    const chapterNumber = match ? parseInt(match[1]) : null;
                    if (chapterNumber && !isNaN(chapterNumber)) {
                        chapters.push({ number: chapterNumber, title: text, url: href.startsWith('http') ? href : 'https://manhwaweb.com' + href, slug: href.replace('/leer/', '').replace(/^\\//, '') });
                    }
                }
            });
            const unique = chapters.filter((chapter, index, self) => index === self.findIndex((c) => c.number === chapter.number));
            return unique.sort((a, b) => b.number - a.number);
        })();
        """

        case extractImages = """
        (function() {
            const imageUrls = [];
            const imgs = document.querySelectorAll('img');
            imgs.forEach(img => {
                let src = img.src || img.getAttribute('data-src');
                if (src) {
                    const alt = (img.alt || '').toLowerCase();
                    const className = (img.className || '').toLowerCase();
                    const isUIElement = src.includes('avatar') || src.includes('icon') || src.includes('logo') || src.includes('button') || alt.includes('avatar') || className.includes('avatar') || className.includes('icon');
                    if (!isUIElement && src.includes('http')) imageUrls.push(src);
                }
            });
            return [...new Set(imageUrls)];
        })();
        """

        case extractMangaInfo = """
        (function() {
            let title = '';
            const titleEl = document.querySelector('h1') || document.querySelector('.title') || document.querySelector('[class*="title"]');
            if (titleEl) title = titleEl.textContent?.trim() || '';
            if (!title) title = document.title.replace(' - ManhwaWeb', '').replace(' - Manhwa Web', '').trim();

            let description = '';
            const paragraphs = document.querySelectorAll('p');
            for (const p of paragraphs) {
                const text = p.textContent?.trim() || '';
                if (text.length > 100 && !text.includes('©')) { description = text; break; }
            }

            const genres = [];
            const genreLinks = document.querySelectorAll('a[href*="/genero/"]');
            genreLinks.forEach(link => { const genre = link.textContent?.trim(); if (genre) genres.push(genre); });

            let status = 'UNKNOWN';
            const bodyText = document.body.textContent || '';
            const statusMatch = bodyText.match(/Estado\\s*:?\\s*(PUBLICANDOSE|FINALIZADO|EN PAUSA|EN_ESPERA)/i);
            if (statusMatch) status = statusMatch[1].toUpperCase().replace(' ', '_');

            let coverImage = '';
            const coverImg = document.querySelector('.cover img') || document.querySelector('[class*="cover"] img') || document.querySelector('img[alt*="cover"]');
            if (coverImg) coverImage = coverImg.src || '';

            return { title: title, description: description, genres: genres, status: status, coverImage: coverImage };
        })();
        """
    }

    /// Asks the page's JavaScript engine to collect garbage, but only when the
    /// engine actually exposes `window.gc`.
    private let garbageCollectionScript = """
    if (window.gc && typeof window.gc === 'function') {
        window.gc();
    }
    """

    // MARK: - Adaptive Timeout

    /// The most recent measured navigation times (at most 10).
    private var loadTimeHistory: [TimeInterval] = []

    /// Moving average of `loadTimeHistory`, clamped to 2...8 seconds.
    private var averageLoadTime: TimeInterval = 3.0

    /// How often `loadURLAndWait` checks the web view while waiting (100 ms).
    private let loadPollInterval: UInt64 = 100_000_000

    // MARK: - Concurrency Control

    /// `true` while a scrape owns the shared web view.
    private var isWebViewBusy = false

    /// Scrapes suspended until the web view becomes free, in arrival order.
    private var webViewWaiters: [CheckedContinuation<Void, Never>] = []

    // MARK: - Memory Management

    private var lastMemoryCleanup: Date = Date.distantPast
    private let memoryCleanupInterval: TimeInterval = 300 // 5 minutes

    // MARK: - Init

    private override init() {
        // Configure a local instance first so `self` is not touched before
        // `super.init()`.
        let cache = NSCache<NSString, CacheEntry>()
        cache.countLimit = 50                       // at most 50 cached results
        cache.totalCostLimit = 50 * 1024 * 1024     // roughly 50 MB of JSON text
        self.htmlCache = cache

        super.init()

        setupWebView()
        setupCacheNotifications()
    }

    // MARK: - Setup

    /// Creates and configures the hidden web view used for scraping.
    private func setupWebView() {
        let configuration = WKWebViewConfiguration()
        configuration.applicationNameForUserAgent = "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15"

        // Enable JavaScript through the modern API when available.
        if #available(iOS 14.0, *) {
            configuration.defaultWebpagePreferences.allowsContentJavaScript = true
        } else {
            // Fallback for iOS 13 and earlier.
            let preferences = WKPreferences()
            preferences.javaScriptEnabled = true
            configuration.preferences = preferences
        }

        // Media playback is never needed for scraping.
        configuration.allowsInlineMediaPlayback = false
        configuration.mediaTypesRequiringUserActionForPlayback = .all

        let scrapingView = WKWebView(frame: .zero, configuration: configuration)
        scrapingView.navigationDelegate = self

        // The view is only a scraping engine; keep it invisible.
        scrapingView.isHidden = true
        scrapingView.alpha = 0

        webView = scrapingView
    }

    /// Registers for memory warnings so the cache can be dropped under pressure.
    private func setupCacheNotifications() {
        NotificationCenter.default.addObserver(
            self,
            selector: #selector(clearMemoryCache),
            name: UIApplication.didReceiveMemoryWarningNotification,
            object: nil
        )
    }

    /// Memory-warning handler: empties the result cache and asks the page to
    /// collect garbage if it can.
    @objc private func clearMemoryCache() {
        htmlCache.removeAllObjects()
        // Guarded script: calling `window.gc()` unconditionally raises a
        // JavaScript error on engines that do not expose it.
        webView?.evaluateJavaScript(garbageCollectionScript, completionHandler: nil)

        print("💾 Memory cache cleared due to warning")
    }

    // MARK: - Scraper Functions

    /// Returns the chapter list of a manga, from cache when still fresh.
    ///
    /// - Parameter mangaSlug: Slug appended to `https://manhwaweb.com/manga/`.
    /// - Returns: The chapters parsed from the page (the script sorts them by
    ///   descending chapter number).
    /// - Throws: `ScrapingError.invalidURL`, `.webViewNotInitialized` or
    ///   `.parsingError`; also rethrows JavaScript-evaluation, JSON and task
    ///   cancellation errors.
    func scrapeChapters(mangaSlug: String) async throws -> [Chapter] {
        let cacheKey = "chapters_\(mangaSlug)"

        // Fast path: a cache hit never needs the web view, so it never queues.
        if let cachedJSON = getCachedResult(for: cacheKey) {
            print("✅ Cache HIT for chapters: \(mangaSlug)")
            return try parseChapters(from: cachedJSON)
        }

        print("🌐 Cache MISS - Scraping chapters: \(mangaSlug)")

        guard let url = URL(string: "https://manhwaweb.com/manga/\(mangaSlug)") else {
            throw ScrapingError.invalidURL
        }

        await acquireWebView()
        defer { releaseWebView() }

        guard let webView = webView else {
            throw ScrapingError.webViewNotInitialized
        }

        try await loadURLAndWait(url, timeout: getAdaptiveTimeout())

        let chapterResult = try await webView.evaluateJavaScript(JavaScriptScripts.extractChapters.rawValue)
        guard let chapters = chapterResult as? [[String: Any]] else {
            throw ScrapingError.parsingError
        }

        // Serialize once; the same JSON feeds both the cache and the parser.
        let jsonData = try JSONSerialization.data(withJSONObject: chapters)
        guard let jsonString = String(data: jsonData, encoding: .utf8) else {
            throw ScrapingError.parsingError
        }
        cacheResult(jsonString, for: cacheKey)

        return try parseChapters(from: jsonString)
    }

    /// Returns the image URLs of a chapter, from cache when still fresh
    /// (15-minute validity).
    ///
    /// - Parameter chapterSlug: Slug appended to `https://manhwaweb.com/leer/`.
    /// - Returns: The de-duplicated image URLs reported by the page script.
    /// - Throws: `ScrapingError.invalidURL`, `.webViewNotInitialized` or
    ///   `.parsingError`; also rethrows JavaScript-evaluation and task
    ///   cancellation errors.
    func scrapeChapterImages(chapterSlug: String) async throws -> [String] {
        let cacheKey = "images_\(chapterSlug)"

        if let cachedJSON = getCachedResult(for: cacheKey, customDuration: 900) {
            print("✅ Cache HIT for images: \(chapterSlug)")
            guard let data = cachedJSON.data(using: .utf8),
                  let images = try? JSONSerialization.jsonObject(with: data) as? [String] else {
                throw ScrapingError.parsingError
            }
            return images
        }

        print("🌐 Cache MISS - Scraping images: \(chapterSlug)")

        guard let url = URL(string: "https://manhwaweb.com/leer/\(chapterSlug)") else {
            throw ScrapingError.invalidURL
        }

        await acquireWebView()
        defer { releaseWebView() }

        guard let webView = webView else {
            throw ScrapingError.webViewNotInitialized
        }

        // Reader pages get two extra seconds on top of the adaptive wait.
        try await loadURLAndWait(url, timeout: getAdaptiveTimeout() + 2.0)

        let imageResult = try await webView.evaluateJavaScript(JavaScriptScripts.extractImages.rawValue)
        guard let images = imageResult as? [String] else {
            throw ScrapingError.parsingError
        }

        // Caching is best-effort; a serialization failure must not fail the scrape.
        if let data = try? JSONSerialization.data(withJSONObject: images),
           let jsonString = String(data: data, encoding: .utf8) {
            cacheResult(jsonString, for: cacheKey)
        }

        return images
    }

    /// Returns the metadata of a manga, from cache when still fresh
    /// (1-hour validity).
    ///
    /// - Parameter mangaSlug: Slug appended to `https://manhwaweb.com/manga/`.
    /// - Throws: `ScrapingError.invalidURL`, `.webViewNotInitialized` or
    ///   `.parsingError`; also rethrows JavaScript-evaluation and task
    ///   cancellation errors.
    func scrapeMangaInfo(mangaSlug: String) async throws -> Manga {
        let cacheKey = "info_\(mangaSlug)"

        if let cachedJSON = getCachedResult(for: cacheKey, customDuration: 3600) {
            print("✅ Cache HIT for manga info: \(mangaSlug)")
            guard let data = cachedJSON.data(using: .utf8),
                  let info = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
                throw ScrapingError.parsingError
            }
            return try parseMangaInfo(from: info, mangaSlug: mangaSlug)
        }

        print("🌐 Cache MISS - Scraping manga info: \(mangaSlug)")

        // A slug that does not form a valid URL is reported as an error
        // instead of crashing on a force unwrap.
        guard let url = URL(string: "https://manhwaweb.com/manga/\(mangaSlug)") else {
            throw ScrapingError.invalidURL
        }

        await acquireWebView()
        defer { releaseWebView() }

        guard let webView = webView else {
            throw ScrapingError.webViewNotInitialized
        }

        try await loadURLAndWait(url, timeout: getAdaptiveTimeout())

        let infoResult = try await webView.evaluateJavaScript(JavaScriptScripts.extractMangaInfo.rawValue)
        guard let mangaInfo = infoResult as? [String: Any] else {
            throw ScrapingError.parsingError
        }

        // Caching is best-effort; a serialization failure must not fail the scrape.
        if let data = try? JSONSerialization.data(withJSONObject: mangaInfo),
           let jsonString = String(data: data, encoding: .utf8) {
            cacheResult(jsonString, for: cacheKey)
        }

        return try parseMangaInfo(from: mangaInfo, mangaSlug: mangaSlug)
    }

    // MARK: - Web View Access

    /// Suspends until the caller owns the shared web view.
    ///
    /// Waiting suspends the task instead of blocking the thread, so the main
    /// actor stays free to run the scrape that currently owns the view.
    /// Every call must be balanced by `releaseWebView()`.
    private func acquireWebView() async {
        guard isWebViewBusy else {
            isWebViewBusy = true
            return
        }
        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
            webViewWaiters.append(continuation)
        }
    }

    /// Gives up the web view, handing it to the longest-waiting scrape if any.
    private func releaseWebView() {
        if webViewWaiters.isEmpty {
            isWebViewBusy = false
        } else {
            // Ownership passes directly to the next waiter, so the busy flag
            // stays set and no third caller can slip in between.
            webViewWaiters.removeFirst().resume()
        }
    }

    // MARK: - Helper Methods

    /// Starts loading `url` and returns once `timeout` seconds have elapsed.
    ///
    /// The full timeout is always waited, so scripts on the page get time to
    /// run before extraction. During the wait the web view is polled, and the
    /// moment it stops reporting `isLoading` is recorded as the navigation
    /// time that feeds the adaptive timeout.
    /// NOTE(review): assumes `isLoading` is already `true` by the first poll
    /// after `load(_:)` — confirm against WebKit behaviour.
    ///
    /// - Throws: `ScrapingError.webViewNotInitialized`, or `CancellationError`
    ///   if the task is cancelled while waiting.
    private func loadURLAndWait(_ url: URL, timeout: TimeInterval) async throws {
        guard let webView = webView else {
            throw ScrapingError.webViewNotInitialized
        }

        let startTime = Date()
        webView.load(URLRequest(url: url))

        var navigationTime: TimeInterval?
        while Date().timeIntervalSince(startTime) < timeout {
            try await Task.sleep(nanoseconds: loadPollInterval)
            if navigationTime == nil && !webView.isLoading {
                navigationTime = Date().timeIntervalSince(startTime)
            }
        }

        // If the page never finished, the wait itself is the best estimate.
        updateLoadTimeHistory(navigationTime ?? timeout)

        performMemoryCleanupIfNeeded()
    }

    /// Requests a JavaScript garbage collection at most once per
    /// `memoryCleanupInterval`.
    private func performMemoryCleanupIfNeeded() {
        let now = Date()
        guard now.timeIntervalSince(lastMemoryCleanup) > memoryCleanupInterval else {
            return
        }
        webView?.evaluateJavaScript(garbageCollectionScript, completionHandler: nil)
        lastMemoryCleanup = now
    }

    /// Adds `loadTime` to the history and recomputes the clamped moving average.
    private func updateLoadTimeHistory(_ loadTime: TimeInterval) {
        loadTimeHistory.append(loadTime)

        // Keep only the 10 most recent samples.
        if loadTimeHistory.count > 10 {
            loadTimeHistory.removeFirst()
        }

        let mean = loadTimeHistory.reduce(0, +) / Double(loadTimeHistory.count)

        // Never go below 2 seconds or above 8.
        averageLoadTime = max(2.0, min(mean, 8.0))
    }

    /// Returns the current wait: the average load time plus a 1-second margin.
    private func getAdaptiveTimeout() -> TimeInterval {
        averageLoadTime + 1.0
    }

    // MARK: - Cache Management

    /// Returns the cached JSON for `key` if it is younger than the validity
    /// window, removing it from the cache when it has expired.
    ///
    /// - Parameters:
    ///   - key: Cache key.
    ///   - customDuration: Validity window in seconds; defaults to
    ///     `cacheValidDuration` when `nil`.
    private func getCachedResult(for key: String, customDuration: TimeInterval? = nil) -> String? {
        let cacheKey = key as NSString
        guard let entry = htmlCache.object(forKey: cacheKey) else {
            return nil
        }

        let validDuration = customDuration ?? cacheValidDuration
        guard Date().timeIntervalSince(entry.storedAt) < validDuration else {
            htmlCache.removeObject(forKey: cacheKey)
            return nil
        }

        return entry.json
    }

    /// Stores `value` under `key`, stamped with the current time.
    private func cacheResult(_ value: String, for key: String) {
        let entry = CacheEntry(json: value, storedAt: Date())
        // Passing a cost is what makes `totalCostLimit` take effect.
        htmlCache.setObject(entry, forKey: key as NSString, cost: value.utf8.count)
    }

    /// Empties the whole cache. Safe to call from any context; the work is
    /// dispatched to the main actor.
    nonisolated func clearAllCache() {
        Task { @MainActor in
            self.htmlCache.removeAllObjects()
            print("🧹 All cache cleared manually")
        }
    }

    // MARK: - Parsing Methods

    /// Decodes the JSON array produced by the chapter script into `Chapter`
    /// values, skipping any element that lacks a required field.
    ///
    /// - Throws: `ScrapingError.parsingError` when `jsonString` is not a JSON
    ///   array of objects.
    private func parseChapters(from jsonString: String) throws -> [Chapter] {
        guard let data = jsonString.data(using: .utf8),
              let chapters = try? JSONSerialization.jsonObject(with: data) as? [[String: Any]] else {
            throw ScrapingError.parsingError
        }

        return chapters.compactMap { dict -> Chapter? in
            guard let number = dict["number"] as? Int,
                  let title = dict["title"] as? String,
                  let url = dict["url"] as? String,
                  let slug = dict["slug"] as? String else {
                return nil
            }
            return Chapter(number: number, title: title, url: url, slug: slug)
        }
    }

    /// Builds a `Manga` from the dictionary produced by the info script.
    ///
    /// Only `title` is required; the other fields fall back to empty values,
    /// and an empty cover URL becomes `nil`.
    ///
    /// - Throws: `ScrapingError.parsingError` when `title` is missing.
    private func parseMangaInfo(from info: [String: Any], mangaSlug: String) throws -> Manga {
        guard let title = info["title"] as? String else {
            throw ScrapingError.parsingError
        }

        let description = info["description"] as? String ?? ""
        let genres = info["genres"] as? [String] ?? []
        let status = info["status"] as? String ?? "UNKNOWN"
        let coverImage = info["coverImage"] as? String
        let url = "https://manhwaweb.com/manga/\(mangaSlug)"

        return Manga(
            slug: mangaSlug,
            title: title,
            description: description,
            genres: genres,
            status: status,
            url: url,
            coverImage: coverImage?.isEmpty == false ? coverImage : nil
        )
    }

    deinit {
        NotificationCenter.default.removeObserver(self)
    }
}
|
|
|
|
// MARK: - WKNavigationDelegate
|
|
extension ManhwaWebScraperOptimized: WKNavigationDelegate {

    /// Navigation finished. Nothing is done here.
    func webView(_ webView: WKWebView, didFinish navigation: WKNavigation!) {}

    /// A committed navigation failed; the error is only logged.
    func webView(_ webView: WKWebView, didFail navigation: WKNavigation!, withError error: Error) {
        let reason = error.localizedDescription
        print("❌ Navigation failed: " + reason)
    }

    /// A navigation failed before it was committed; the error is only logged.
    func webView(_ webView: WKWebView, didFailProvisionalNavigation navigation: WKNavigation!, withError error: Error) {
        let reason = error.localizedDescription
        print("❌ Provisional navigation failed: " + reason)
    }
}
|
|
|
|
// ScrapingError is defined in ManhwaWebScraper.swift to avoid duplicate declaration
|