# dosage/tests/modules/check_comics.py

# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2021 Tobias Gruetzmacher
import json
import multiprocessing
import os
import re
import warnings
from urllib.parse import urlsplit


# Per-host semaphores, keyed by host name and created on demand by get_lock().
_locks = {}
# Allowed number of simultaneous connections per host
MaxConnections = 2
# Maximum number of strips to fetch when testing a comic
MaxStrips = 5
# Matches the numeric path segment (presumably the snapshot timestamp) of an
# already re.escape()d web.archive.org URL
ARCHIVE_ORG_MATCH = re.compile(r'(?<=web\\.archive\\.org/web)/\d+/')
# Matches some (maybe-escaped - because Python 2) printf-style format specifiers
PRINTF_MATCH = re.compile(r'\\?%[0-9]*[sd]')
# Module classes whose comics are so similar to each other that testing the
# history of every single comic doesn't make much sense
standarized_modules = {
    'ComicSherpa',
    'ComicsKingdom',
    'GoComics',
    'MangaDex',
    'WebToons',
}
# Entries of standarized_modules that test_comicmodule() has already seen
seen_modules = set()


def get_lock(host):
    """Return the bounded semaphore for the given host.

    The semaphore is created with MaxConnections slots the first time a host
    is requested and is reused for every later request for the same host.
    """
    try:
        return _locks[host]
    except KeyError:
        semaphore = multiprocessing.BoundedSemaphore(MaxConnections)
        _locks[host] = semaphore
        return semaphore


def test_comicmodule(tmpdir, scraperobj, worker_id):
    '''Test a single scraper while holding the semaphore of its host.

    Normally up to MaxStrips strips are traversed. Scrapers whose name has the
    form "<module>/<comic>" with <module> listed in standarized_modules are
    only tested with the full strip count the first time that module is seen;
    later comics of the same module are limited to a single strip.

    worker_id is not used in the body (presumably the pytest-xdist fixture -
    confirm against the test configuration).
    '''
    hostname = urlsplit(scraperobj.url).hostname
    # Limit number of connections to one host.
    with get_lock(hostname):
        # A non-empty separator means the name really contains a '/'.
        module, separator, _ = scraperobj.name.partition('/')
        limit = MaxStrips
        if separator and module in standarized_modules:
            if module in seen_modules:
                limit = 1
            else:
                seen_modules.add(module)

        _test_comic(str(tmpdir), scraperobj, limit)


def _test_comic(outdir, scraperobj, maxstrips):
    '''Traverse up to maxstrips strips of a scraper and check the results.

    Every strip is checked for images with _check_strip(); every strip except
    the first one is additionally checked against the scraper's stripUrl
    pattern with _check_stripurl(). If the scraper supports backward
    navigation (prevSearch) and did not reach its first strip, the number of
    traversed strips and of saved image files is verified as well.

    outdir is the directory image files are saved to, scraperobj the scraper
    under test and maxstrips the maximum number of strips to request.
    Raises AssertionError when one of the checks fails.
    '''
    num_strips = 0
    strip = None
    files = []
    # Select the proxy (if any) configured for this scraper.
    PROXYMAP.apply(scraperobj.name)
    for strip in scraperobj.getStrips(maxstrips):
        # _check_strip() returns one list of file names per strip. Collect
        # them in a flat list, so that _check_scraperesult() counts saved
        # image files instead of strips.
        files.extend(_check_strip(outdir, strip,
                                  scraperobj.multipleImagesPerStrip))

        if num_strips > 0:
            _check_stripurl(strip, scraperobj)
        num_strips += 1

    if scraperobj.prevSearch and not scraperobj.hitFirstStripUrl:
        # subtract the number of skipped URLs with no image from the expected
        # image number
        num_strips_expected = maxstrips - len(scraperobj.skippedUrls)
        msg = 'Traversed %d strips instead of %d.' % (num_strips,
                                                      num_strips_expected)
        if strip:
            msg += " Check the prevSearch pattern at %s" % strip.strip_url
        assert num_strips == num_strips_expected, msg
        if strip:
            _check_scraperesult(files, num_strips_expected, strip, scraperobj)


def _check_strip(outdir, strip, multipleImagesPerStrip):
    '''Verify that a strip page yields images and save each of them.

    At least one image must be found, and exactly one unless
    multipleImagesPerStrip is set. Returns the list of file names reported by
    image.save(), in the order the images were found.
    '''
    urls = []
    saved = []
    for img in strip.getImages():
        urls.append(img.url)

        # write a fake image (to download less)
        placeholder = img._fnbase(outdir) + '.fake'
        with open(placeholder, 'w') as handle:
            handle.write("fake image for testing")

        filename, _ = img.save(outdir)
        saved.append(filename)

    assert urls, 'failed to find images at %s' % strip.strip_url
    assert multipleImagesPerStrip or len(urls) == 1, (
        'found more than 1 image at {}: {}'.format(strip.strip_url, urls))
    return saved


def _check_scraperesult(saved_images, num_images_expected, strip, scraperobj):
    '''Compare the number of entries in saved_images with the expected count.

    Scrapers with multipleImagesPerStrip need at least num_images_expected
    entries, all other scrapers exactly that many. The strip argument is
    accepted but not used.
    '''
    count = len(saved_images)
    at_least = scraperobj.multipleImagesPerStrip
    if at_least:
        template = 'saved %d %s instead of at least %d images'
    else:
        template = 'saved %d %s instead of %d images'
    # The message is built up front, independent of the check's outcome.
    message = template % (count, saved_images, num_images_expected)

    if at_least:
        assert count >= num_images_expected, message
    else:
        assert count == num_images_expected, message


def _check_stripurl(strip, scraperobj):
    '''Warn if the URL of a retrieved strip does not fit the stripUrl template.

    The scraper's stripUrl is escaped and turned into a regular expression:
    printf-style placeholders become ".+" and the numeric path segment of
    web.archive.org URLs becomes "\\d+". The expression only has to match at
    the start of the strip URL. A mismatch is reported with warnings.warn()
    and is therefore not fatal. Scrapers without stripUrl are skipped.
    '''
    template = scraperobj.stripUrl
    if not template:
        # no indexing support
        return
    pattern = ARCHIVE_ORG_MATCH.sub(
        r'/\\d+/', PRINTF_MATCH.sub('.+', re.escape(template)))
    if re.match(pattern, strip.strip_url) is None:
        warnings.warn('strip URL {!r} does not match stripUrl pattern {}'.format(
            strip.strip_url, pattern))


class ProxyConfig:
    """Proxy selection driven by the PROXYMAP environment variable.

    PROXYMAP, when set, is parsed as a JSON object that maps regular
    expressions to proxy servers. The expressions are compiled once and kept
    in self.config.
    """
    def __init__(self):
        raw = os.environ.get('PROXYMAP')
        mapping = {} if raw is None else json.loads(raw)
        self.config = {
            re.compile(pattern): proxy for pattern, proxy in mapping.items()
        }

    def apply(self, name):
        """Export the proxy of the first expression matching name.

        Both http_proxy and https_proxy are set in os.environ; they are set
        to the empty string when no expression matches.
        """
        chosen = next(
            (proxy for pattern, proxy in self.config.items()
             if pattern.match(name)),
            '')
        os.environ['http_proxy'] = chosen
        os.environ['https_proxy'] = chosen

# External proxy config to fetch some modules via proxies; built once at
# import time from the PROXYMAP environment variable (see ProxyConfig).
PROXYMAP = ProxyConfig()
Update file headers The default encoding for source files is UTF-8 since Python 3, so we can drop all encoding headers. While we are at it, just replace them with SPDX headers. 2020-04-18 11:45:44 +00:00			`# SPDX-License-Identifier: MIT`
Fixup copyright years. 2016-10-28 22:21:41 +00:00			`# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs`
Updated copyright. 2014-01-05 15:50:57 +00:00			`# Copyright (C) 2012-2014 Bastian Kleineidam`
Speed up comic checks by avoiding redundant tests We don't need to test the "navigation" of each and every comic on the same hoster, if those test give us no new information (this is true for most "modern" hosters which don't allow individual designs/HTML per comic) 2021-01-18 23:20:09 +00:00			`# Copyright (C) 2015-2021 Tobias Gruetzmacher`
Allow proxies for module tests This should make it easier to include geo-blocked comics. 2020-09-28 23:50:02 +00:00			`import json`
Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00			`import multiprocessing`
Allow proxies for module tests This should make it easier to include geo-blocked comics. 2020-09-28 23:50:02 +00:00			`import os`
			`import re`
Make URL differences non-fatal in module tests 2020-09-26 21:08:00 +00:00			`import warnings`
Drop Python 2 support: six & other imports 2020-02-03 00:03:31 +00:00			`from urllib.parse import urlsplit`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00			`# Dictionary with per-host locks.`
			`_locks = {}`
			`# Allowed number of connections per host`
Tests: Keep comics of the same module in the same process This allows our host-based throttling to be effective and keeps cross-process locks to a minimum. 2019-12-03 22:35:41 +00:00			`MaxConnections = 2`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`# Maximum number of strips to get to test a comic`
			`MaxStrips = 5`
Tests: Ignore difference in archive.org snapshots 2020-01-10 13:53:01 +00:00			`# Match (already-escaped) archive.org URL`
			`ARCHIVE_ORG_MATCH = re.compile(r'(?<=web\\.archive\\.org/web)/\d+/')`
			`# Matches some (maybe-escaped - because Python 2) printf-style format specifiers`
			`PRINTF_MATCH = re.compile(r'\\?%[0-9]*[sd]')`
Speed up comic checks by avoiding redundant tests We don't need to test the "navigation" of each and every comic on the same hoster, if those test give us no new information (this is true for most "modern" hosters which don't allow individual designs/HTML per comic) 2021-01-18 23:20:09 +00:00			`# Classes where the modules are very similar, so that testing the history of`
			`# each modules doesn't make much sense`
			`standarized_modules = {`
			`'ComicSherpa',`
			`'ComicsKingdom',`
			`'GoComics',`
			`'MangaDex',`
			`'WebToons',`
			`}`
			`# Already seen classes`
			`seen_modules = set()`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00
			`def get_lock(host):`
			`"""Get bounded semphore for given host."""`
			`if host not in _locks:`
			`_locks[host] = multiprocessing.BoundedSemaphore(MaxConnections)`
			`return _locks[host]`

Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Tests: Keep comics of the same module in the same process This allows our host-based throttling to be effective and keeps cross-process locks to a minimum. 2019-12-03 22:35:41 +00:00			`def test_comicmodule(tmpdir, scraperobj, worker_id):`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`'''Test a scraper. It must be able to traverse backward for at least 5`
			`strips from the start, and find strip images on at least 4 pages.'''`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`# Limit number of connections to one host.`
Tests: Keep comics of the same module in the same process This allows our host-based throttling to be effective and keeps cross-process locks to a minimum. 2019-12-03 22:35:41 +00:00			`host = urlsplit(scraperobj.url).hostname`
			`with get_lock(host):`
Speed up comic checks by avoiding redundant tests We don't need to test the "navigation" of each and every comic on the same hoster, if those test give us no new information (this is true for most "modern" hosters which don't allow individual designs/HTML per comic) 2021-01-18 23:20:09 +00:00			`maxstrips = MaxStrips`
			`parts = scraperobj.name.split('/', maxsplit=1)`
			`if len(parts) > 1 and parts[0] in standarized_modules:`
			`if parts[0] in seen_modules:`
			`maxstrips = 1`
			`else:`
			`seen_modules.add(parts[0])`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Speed up comic checks by avoiding redundant tests We don't need to test the "navigation" of each and every comic on the same hoster, if those test give us no new information (this is true for most "modern" hosters which don't allow individual designs/HTML per comic) 2021-01-18 23:20:09 +00:00			`_test_comic(str(tmpdir), scraperobj, maxstrips)`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Speed up comic checks by avoiding redundant tests We don't need to test the "navigation" of each and every comic on the same hoster, if those test give us no new information (this is true for most "modern" hosters which don't allow individual designs/HTML per comic) 2021-01-18 23:20:09 +00:00
			`def _test_comic(outdir, scraperobj, maxstrips):`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`num_strips = 0`
			`strip = None`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`files = []`
Allow proxies for module tests This should make it easier to include geo-blocked comics. 2020-09-28 23:50:02 +00:00			`PROXYMAP.apply(scraperobj.name)`
Speed up comic checks by avoiding redundant tests We don't need to test the "navigation" of each and every comic on the same hoster, if those test give us no new information (this is true for most "modern" hosters which don't allow individual designs/HTML per comic) 2021-01-18 23:20:09 +00:00			`for strip in scraperobj.getStrips(maxstrips):`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`files.append(_check_strip(outdir, strip,`
			`scraperobj.multipleImagesPerStrip))`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Remove prevUrlMatchesStripUrl. It was only used for one test. 2016-04-15 23:14:26 +00:00			`if num_strips > 0:`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`_check_stripurl(strip, scraperobj)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`num_strips += 1`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if scraperobj.prevSearch and not scraperobj.hitFirstStripUrl:`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`# subtract the number of skipped URLs with no image from the expected`
			`# image number`
Speed up comic checks by avoiding redundant tests We don't need to test the "navigation" of each and every comic on the same hoster, if those test give us no new information (this is true for most "modern" hosters which don't allow individual designs/HTML per comic) 2021-01-18 23:20:09 +00:00			`num_strips_expected = maxstrips - len(scraperobj.skippedUrls)`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`msg = 'Traversed %d strips instead of %d.' % (num_strips,`
			`num_strips_expected)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if strip:`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`msg += " Check the prevSearch pattern at %s" % strip.strip_url`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`assert num_strips == num_strips_expected, msg`
			`if strip:`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`_check_scraperesult(files, num_strips_expected, strip, scraperobj)`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00

			`def _check_strip(outdir, strip, multipleImagesPerStrip):`
			`'''Check that a specific page yields images and the comic module correctly`
			`declares if there are multiple images per page.'''`
			`images = []`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`files = []`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`for image in strip.getImages():`
			`images.append(image.url)`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00
			`# write a fake image (to download less)`
			`fakeimg = image._fnbase(outdir) + '.fake'`
			`with open(fakeimg, 'w') as f:`
			`f.write("fake image for testing")`

			`fn, _ = image.save(outdir)`
			`files.append(fn)`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`assert images, 'failed to find images at %s' % strip.strip_url`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`if not multipleImagesPerStrip:`
Kenneth Reitz’s Code Style™ See http://docs.python-requests.org/en/master/dev/contributing/#kenneth-reitz-s-code-style Effectively, this removes "visual" indents. 2018-06-29 17:26:17 +00:00			`assert len(images) == 1, 'found more than 1 image at {}: {}'.format(`
			`strip.strip_url, images)`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`return files`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`def _check_scraperesult(saved_images, num_images_expected, strip, scraperobj):`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`'''Check that exactly or for multiple pages at least num_strips images are`
			`saved. This checks saved files, ie. it detects duplicate filenames.'''`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`num_images = len(saved_images)`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`attrs = (num_images, saved_images, num_images_expected)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if scraperobj.multipleImagesPerStrip:`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`err = 'saved %d %s instead of at least %d images' % attrs`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`assert num_images >= num_images_expected, err`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`else:`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`err = 'saved %d %s instead of %d images' % attrs`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`assert num_images == num_images_expected, err`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
			`def _check_stripurl(strip, scraperobj):`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if not scraperobj.stripUrl:`
			`# no indexing support`
			`return`
			`# test that the stripUrl regex matches the retrieved strip URL`
			`urlmatch = re.escape(scraperobj.stripUrl)`
Tests: Ignore difference in archive.org snapshots 2020-01-10 13:53:01 +00:00			`urlmatch = PRINTF_MATCH.sub('.+', urlmatch)`
			`urlmatch = ARCHIVE_ORG_MATCH.sub(r'/\\d+/', urlmatch)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`ro = re.compile(urlmatch)`
Python 2 fix - yes, really 2019-12-27 17:38:09 +00:00			`mo = ro.match(strip.strip_url)`
Make URL differences non-fatal in module tests 2020-09-26 21:08:00 +00:00			`if not mo:`
			`warnings.warn('strip URL {!r} does not match stripUrl pattern {}'.format(`
			`strip.strip_url, urlmatch))`
Allow proxies for module tests This should make it easier to include geo-blocked comics. 2020-09-28 23:50:02 +00:00

			`class ProxyConfig:`
			`"""Loads proxy config from an environment variable and applies it for each test."""`
			`def __init__(self):`
			`self.config = {}`
			`if 'PROXYMAP' in os.environ:`
			`for regex, server in json.loads(os.environ['PROXYMAP']).items():`
			`self.config[re.compile(regex)] = server`

			`def apply(self, name):`
			`useserver = ''`
			`for regex, server in self.config.items():`
			`if regex.match(name):`
			`useserver = server`
			`break`
			`os.environ['http_proxy'] = useserver`
			`os.environ['https_proxy'] = useserver`


			`# External proxy config to fetch some modules via proxies`
			`PROXYMAP = ProxyConfig()`