2020-04-18 11:45:44 +00:00
|
|
|
# SPDX-License-Identifier: MIT
|
2016-10-28 22:21:41 +00:00
|
|
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
2014-01-05 15:50:57 +00:00
|
|
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
2021-01-18 23:20:09 +00:00
|
|
|
# Copyright (C) 2015-2021 Tobias Gruetzmacher
|
2020-09-28 23:50:02 +00:00
|
|
|
import json
|
2013-02-28 20:08:43 +00:00
|
|
|
import multiprocessing
|
2020-09-28 23:50:02 +00:00
|
|
|
import os
|
|
|
|
import re
|
2020-09-26 21:08:00 +00:00
|
|
|
import warnings
|
2020-02-03 00:03:31 +00:00
|
|
|
from urllib.parse import urlsplit
|
2012-06-20 19:58:13 +00:00
|
|
|
|
|
|
|
|
2013-02-28 20:08:43 +00:00
|
|
|
# Per-host semaphores, keyed by host and filled in on demand by get_lock().
_locks = {}

# Size of each per-host semaphore, i.e. how many holders one host may have
# at the same time
MaxConnections = 2

# Default number of strips requested from a scraper under test
MaxStrips = 5

# Finds the numeric path segment that follows "web.archive.org/web" in a URL
# that has already been passed through re.escape()
ARCHIVE_ORG_MATCH = re.compile(r'(?<=web\\.archive\\.org/web)/\d+/')

# Finds printf-style %s / %d specifiers (with optional width digits); a
# leading backslash is tolerated since escaping may have added one (a
# leftover from Python 2)
PRINTF_MATCH = re.compile(r'\\?%[0-9]*[sd]')
|
2021-01-18 23:20:09 +00:00
|
|
|
# Scraper families whose members are so alike that walking the full history
# of every single member adds little value
standarized_modules = {
    'ComicSherpa', 'ComicsKingdom', 'GoComics', 'MangaDex', 'WebToons',
}
# Families from the set above that have already been encountered
seen_modules = set()
|
2016-03-07 00:08:57 +00:00
|
|
|
|
2013-02-28 20:08:43 +00:00
|
|
|
|
|
|
|
def get_lock(host):
    """Return the bounded semaphore for the given host.

    The semaphore is created with MaxConnections slots the first time a host
    is requested and stored in _locks, so later calls with the same host get
    the same object back.
    """
    try:
        return _locks[host]
    except KeyError:
        semaphore = multiprocessing.BoundedSemaphore(MaxConnections)
        _locks[host] = semaphore
        return semaphore
|
|
|
|
|
2015-07-17 21:33:25 +00:00
|
|
|
|
2019-12-03 22:35:41 +00:00
|
|
|
def test_comicmodule(tmpdir, scraperobj, worker_id):
    '''Test one scraper while holding the semaphore of its host.

    Up to MaxStrips strips are requested. For scraper names of the form
    "Family/..." whose family is listed in standarized_modules, only the
    first scraper seen of that family gets the full count; every later one
    is limited to a single strip. worker_id is accepted but not used in the
    body.'''
    hostname = urlsplit(scraperobj.url).hostname
    # Everything below runs under the per-host semaphore so that the number
    # of simultaneous holders for one host stays bounded.
    with get_lock(hostname):
        family, slash, _ = scraperobj.name.partition('/')
        strip_limit = MaxStrips
        if slash and family in standarized_modules:
            seen_before = family in seen_modules
            seen_modules.add(family)
            if seen_before:
                strip_limit = 1

        _test_comic(str(tmpdir), scraperobj, strip_limit)
|
2016-03-07 00:08:57 +00:00
|
|
|
|
2021-01-18 23:20:09 +00:00
|
|
|
|
|
|
|
def _test_comic(outdir, scraperobj, maxstrips):
    '''Walk up to maxstrips strips of a scraper and check each of them.

    Proxy settings for the scraper name are applied first. Every strip goes
    through _check_strip; every strip but the first also goes through
    _check_stripurl. If the scraper has a prevSearch and did not hit its
    first strip URL, the number of traversed strips must equal maxstrips
    minus the number of skipped URLs, and the collected results are handed
    to _check_scraperesult.'''
    PROXYMAP.apply(scraperobj.name)

    # One entry per strip: whatever _check_strip returned for that strip.
    saved = []
    last_strip = None
    traversed = 0
    strips = scraperobj.getStrips(maxstrips)
    for traversed, last_strip in enumerate(strips, start=1):
        saved.append(_check_strip(outdir, last_strip,
                                  scraperobj.multipleImagesPerStrip))
        if traversed > 1:
            _check_stripurl(last_strip, scraperobj)

    if not scraperobj.prevSearch or scraperobj.hitFirstStripUrl:
        return

    # URLs that were skipped for lack of an image do not count towards the
    # expected total.
    expected = maxstrips - len(scraperobj.skippedUrls)
    msg = 'Traversed %d strips instead of %d.' % (traversed, expected)
    if last_strip:
        msg += " Check the prevSearch pattern at %s" % last_strip.strip_url
    assert traversed == expected, msg
    if last_strip:
        _check_scraperesult(saved, expected, last_strip, scraperobj)
|
2016-03-07 00:08:57 +00:00
|
|
|
|
|
|
|
|
|
|
|
def _check_strip(outdir, strip, multipleImagesPerStrip):
    '''Verify that a strip page yields images and that the module's
    multipleImagesPerStrip declaration fits the number found.

    Returns the list of first elements returned by image.save(), one per
    image of the strip.'''
    urls = []
    saved_names = []
    for img in strip.getImages():
        urls.append(img.url)

        # Drop a placeholder file next to the target name before saving (to
        # download less).
        # NOTE(review): presumably save() skips the download when it finds
        # this file - confirm against the image class.
        placeholder = img._fnbase(outdir) + '.fake'
        with open(placeholder, 'w') as handle:
            handle.write("fake image for testing")

        name, _ignored = img.save(outdir)
        saved_names.append(name)

    assert urls, 'failed to find images at %s' % strip.strip_url
    assert multipleImagesPerStrip or len(urls) == 1, (
        'found more than 1 image at {}: {}'.format(strip.strip_url, urls))
    return saved_names
|
2015-07-17 21:33:25 +00:00
|
|
|
|
2016-03-07 00:08:57 +00:00
|
|
|
|
2016-07-31 22:44:34 +00:00
|
|
|
def _check_scraperesult(saved_images, num_images_expected, strip, scraperobj):
|
2016-03-07 00:08:57 +00:00
|
|
|
'''Check that exactly or for multiple pages at least num_strips images are
|
|
|
|
saved. This checks saved files, ie. it detects duplicate filenames.'''
|
2015-07-17 21:33:25 +00:00
|
|
|
num_images = len(saved_images)
|
2016-03-07 00:08:57 +00:00
|
|
|
|
2016-07-31 22:44:34 +00:00
|
|
|
attrs = (num_images, saved_images, num_images_expected)
|
2015-07-17 21:33:25 +00:00
|
|
|
if scraperobj.multipleImagesPerStrip:
|
2016-07-31 22:44:34 +00:00
|
|
|
err = 'saved %d %s instead of at least %d images' % attrs
|
2016-03-07 00:08:57 +00:00
|
|
|
assert num_images >= num_images_expected, err
|
2015-07-17 21:33:25 +00:00
|
|
|
else:
|
2016-07-31 22:44:34 +00:00
|
|
|
err = 'saved %d %s instead of %d images' % attrs
|
2016-03-07 00:08:57 +00:00
|
|
|
assert num_images == num_images_expected, err
|
2015-07-17 21:33:25 +00:00
|
|
|
|
2016-03-07 00:08:57 +00:00
|
|
|
|
|
|
|
def _check_stripurl(strip, scraperobj):
    '''Warn if the strip's URL does not fit the scraper's stripUrl template.

    Nothing is checked when the scraper has no stripUrl (no indexing
    support). A mismatch only emits a warning; it does not fail the test.'''
    template = scraperobj.stripUrl
    if not template:
        return

    # Turn the template into a regex: escape it literally, let every printf
    # specifier match anything, and let the numeric archive.org path segment
    # match any run of digits.
    urlmatch = ARCHIVE_ORG_MATCH.sub(
        r'/\\d+/', PRINTF_MATCH.sub('.+', re.escape(template)))

    if re.match(urlmatch, strip.strip_url) is None:
        warnings.warn('strip URL {!r} does not match stripUrl pattern {}'.format(
            strip.strip_url, urlmatch))
|
2020-09-28 23:50:02 +00:00
|
|
|
|
|
|
|
|
|
|
|
class ProxyConfig:
    """Proxy selection driven by the PROXYMAP environment variable.

    PROXYMAP, when set, is parsed as a JSON object mapping regular
    expressions to proxy servers. apply() picks the server for a given name
    and exports it through the http_proxy / https_proxy variables.
    """

    def __init__(self):
        raw = os.environ.get('PROXYMAP')
        mapping = json.loads(raw) if raw is not None else {}
        # Compiled pattern -> proxy server, in the order given by the JSON.
        self.config = {
            re.compile(pattern): server for pattern, server in mapping.items()
        }

    def apply(self, name):
        """Export the proxy of the first pattern matching name.

        Both proxy variables are set to the empty string when no pattern
        matches.
        """
        chosen = next(
            (server for pattern, server in self.config.items()
             if pattern.match(name)),
            '')
        for variable in ('http_proxy', 'https_proxy'):
            os.environ[variable] = chosen


# Proxy configuration read from the environment, used to fetch some modules
# via proxies
PROXYMAP = ProxyConfig()
|