Speed up comic checks by avoiding redundant tests

We don't need to test the "navigation" of each and every comic on the
same hoster if those tests give us no new information (this is true for
most "modern" hosters, which don't allow individual designs/HTML per
comic).
This commit is contained in:
Tobias Gruetzmacher 2021-01-19 00:20:09 +01:00
parent 3d05e59c36
commit 0428fd52b3

View file

@ -1,7 +1,7 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher # Copyright (C) 2015-2021 Tobias Gruetzmacher
import json import json
import multiprocessing import multiprocessing
import os import os
@ -20,6 +20,17 @@ MaxStrips = 5
ARCHIVE_ORG_MATCH = re.compile(r'(?<=web\\.archive\\.org/web)/\d+/') ARCHIVE_ORG_MATCH = re.compile(r'(?<=web\\.archive\\.org/web)/\d+/')
# Matches some (maybe-escaped - because Python 2) printf-style format specifiers # Matches some (maybe-escaped - because Python 2) printf-style format specifiers
PRINTF_MATCH = re.compile(r'\\?%[0-9]*[sd]') PRINTF_MATCH = re.compile(r'\\?%[0-9]*[sd]')
# Classes where the modules are very similar, so that testing the history of
# each module doesn't make much sense
standarized_modules = {
'ComicSherpa',
'ComicsKingdom',
'GoComics',
'MangaDex',
'WebToons',
}
# Already seen classes
seen_modules = set()
def get_lock(host): def get_lock(host):
@ -35,15 +46,23 @@ def test_comicmodule(tmpdir, scraperobj, worker_id):
# Limit number of connections to one host. # Limit number of connections to one host.
host = urlsplit(scraperobj.url).hostname host = urlsplit(scraperobj.url).hostname
with get_lock(host): with get_lock(host):
_test_comic(str(tmpdir), scraperobj) maxstrips = MaxStrips
parts = scraperobj.name.split('/', maxsplit=1)
if len(parts) > 1 and parts[0] in standarized_modules:
if parts[0] in seen_modules:
maxstrips = 1
else:
seen_modules.add(parts[0])
_test_comic(str(tmpdir), scraperobj, maxstrips)
def _test_comic(outdir, scraperobj): def _test_comic(outdir, scraperobj, maxstrips):
num_strips = 0 num_strips = 0
strip = None strip = None
files = [] files = []
PROXYMAP.apply(scraperobj.name) PROXYMAP.apply(scraperobj.name)
for strip in scraperobj.getStrips(MaxStrips): for strip in scraperobj.getStrips(maxstrips):
files.append(_check_strip(outdir, strip, files.append(_check_strip(outdir, strip,
scraperobj.multipleImagesPerStrip)) scraperobj.multipleImagesPerStrip))
@ -54,7 +73,7 @@ def _test_comic(outdir, scraperobj):
if scraperobj.prevSearch and not scraperobj.hitFirstStripUrl: if scraperobj.prevSearch and not scraperobj.hitFirstStripUrl:
# subtract the number of skipped URLs with no image from the expected # subtract the number of skipped URLs with no image from the expected
# image number # image number
num_strips_expected = MaxStrips - len(scraperobj.skippedUrls) num_strips_expected = maxstrips - len(scraperobj.skippedUrls)
msg = 'Traversed %d strips instead of %d.' % (num_strips, msg = 'Traversed %d strips instead of %d.' % (num_strips,
num_strips_expected) num_strips_expected)
if strip: if strip: