dosage/tests/modules/check_comics.py

# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher

from __future__ import absolute_import, division, print_function

import re
import multiprocessing
from six.moves.urllib.parse import urlsplit


# Per-host locks: maps a host name to the semaphore that get_lock() creates
# for it the first time that host is requested.
_locks = {}
# Allowed number of connections per host (used as the initial value of each
# per-host semaphore).
MaxConnections = 2
# Maximum number of strips to get to test a comic
MaxStrips = 5


def get_lock(host):
    """Return the bounded semaphore for ``host``, creating it on first use.

    Semaphores are cached in the module-level ``_locks`` dictionary, so
    repeated calls with the same host return the same object. A newly
    created semaphore starts with ``MaxConnections`` free slots.
    """
    semaphore = _locks.get(host)
    if semaphore is None:
        semaphore = multiprocessing.BoundedSemaphore(MaxConnections)
        _locks[host] = semaphore
    return semaphore


def test_comicmodule(tmpdir, scraperobj, worker_id):
    """Run the comic module checks for one scraper under its host lock.

    The host name is taken from ``scraperobj.url`` and the matching
    semaphore from ``get_lock`` is held while ``_test_comic`` runs, which
    limits the number of simultaneous connections to that host. ``tmpdir``
    is converted to a string and used as the output directory.

    ``worker_id`` is not used in the body.
    NOTE(review): presumably it is only requested as a fixture for the
    parallel test runner -- confirm before removing it.
    """
    hostname = urlsplit(scraperobj.url).hostname
    host_lock = get_lock(hostname)
    # Limit number of connections to one host.
    with host_lock:
        _test_comic(str(tmpdir), scraperobj)


def _test_comic(outdir, scraperobj):
    """Fetch up to ``MaxStrips`` strips from ``scraperobj`` and check them.

    Every strip goes through ``_check_strip``; every strip except the first
    one yielded also has its URL checked by ``_check_stripurl``. When the
    scraper has a ``prevSearch`` and ``hitFirstStripUrl`` is false, the
    number of traversed strips must equal ``MaxStrips`` minus the number of
    skipped URLs, and the collected results are then handed to
    ``_check_scraperesult``.
    """
    strip = None
    num_strips = 0
    files = []
    for position, strip in enumerate(scraperobj.getStrips(MaxStrips)):
        # NOTE(review): _check_strip returns a list, so ``files`` ends up
        # holding one list per strip rather than a flat list of file names.
        files.append(
            _check_strip(outdir, strip, scraperobj.multipleImagesPerStrip))

        # The first strip yielded is not checked against stripUrl.
        if position > 0:
            _check_stripurl(strip, scraperobj)
        num_strips = position + 1

    # The traversal count is only verified for scrapers that have a
    # prevSearch and did not set hitFirstStripUrl.
    if not scraperobj.prevSearch or scraperobj.hitFirstStripUrl:
        return

    # subtract the number of skipped URLs with no image from the expected
    # image number
    num_strips_expected = MaxStrips - len(scraperobj.skippedUrls)
    counts = (num_strips, num_strips_expected)
    msg = 'Traversed %d strips instead of %d.' % counts
    if strip:
        msg += " Check the prevSearch pattern at %s" % strip.strip_url
    assert num_strips == num_strips_expected, msg
    if strip:
        _check_scraperesult(files, num_strips_expected, strip, scraperobj)


def _check_strip(outdir, strip, multipleImagesPerStrip):
    """Check the images of one strip and return the saved file names.

    For every image returned by ``strip.getImages()`` a ``.fake`` placeholder
    file is written next to the image's base file name in ``outdir`` and
    then ``image.save(outdir)`` is called.

    Returns the list of file names reported by ``image.save``.

    Raises ``AssertionError`` when the strip yields no images, or when it
    yields more than one image although ``multipleImagesPerStrip`` is false.
    """
    urls = []
    saved = []
    for image in strip.getImages():
        urls.append(image.url)

        # write a fake image (to download less)
        # NOTE(review): presumably save() sees the placeholder and can avoid
        # fetching the real image -- confirm against the image class.
        placeholder = image._fnbase(outdir) + '.fake'
        with open(placeholder, 'w') as handle:
            handle.write("fake image for testing")

        filename, _ = image.save(outdir)
        saved.append(filename)

    assert urls, 'failed to find images at %s' % strip.strip_url
    # A single image is only required when the module does not declare
    # multiple images per strip.
    assert multipleImagesPerStrip or len(urls) == 1, (
        'found more than 1 image at {}: {}'.format(strip.strip_url, urls))
    return saved


def _check_scraperesult(saved_images, num_images_expected, strip, scraperobj):
    """Assert that the number of saved entries matches the expectation.

    ``len(saved_images)`` must equal ``num_images_expected``; when
    ``scraperobj.multipleImagesPerStrip`` is true it only has to be at least
    that number. ``strip`` is accepted but not used here.

    Raises ``AssertionError`` with the count, the entries and the expected
    number when the check fails.
    """
    num_images = len(saved_images)
    details = (num_images, saved_images, num_images_expected)

    if not scraperobj.multipleImagesPerStrip:
        err = 'saved %d %s instead of %d images' % details
        assert num_images == num_images_expected, err
        return

    err = 'saved %d %s instead of at least %d images' % details
    assert num_images >= num_images_expected, err


def _check_stripurl(strip, scraperobj):
    """Check ``strip.strip_url`` against the scraper's ``stripUrl`` template.

    Each ``%s`` placeholder in ``scraperobj.stripUrl`` stands for one or more
    arbitrary characters; everything else must match literally, and the
    pattern is anchored at both ends. Nothing is checked when ``stripUrl``
    is empty or unset.

    Raises ``AssertionError`` naming the URL and the generated pattern when
    the URL does not match.
    """
    if not scraperobj.stripUrl:
        # no indexing support
        return
    # Escape the literal pieces around each "%s" placeholder separately and
    # join them with ".+". Escaping the whole template first and replacing
    # "%s" afterwards depends on the Python version: before 3.7 re.escape()
    # also escaped "%", so the replacement produced "\.+" (literal dots).
    literal_parts = scraperobj.stripUrl.split("%s")
    urlmatch = ".+".join(re.escape(part) for part in literal_parts)
    urlmatch = "^%s$" % urlmatch
    mo = re.search(urlmatch, strip.strip_url)
    err = 'strip URL {!r} does not match stripUrl pattern {}'.format(
        strip.strip_url, urlmatch)
    assert mo is not None, err
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`# -- coding: utf-8 --`
Fixup copyright years. 2016-10-28 22:21:41 +00:00			`# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs`
Updated copyright. 2014-01-05 15:50:57 +00:00			`# Copyright (C) 2012-2014 Bastian Kleineidam`
Tests: Keep comics of the same module in the same process This allows our host-based throttling to be effective and keeps cross-process locks to a minimum. 2019-12-03 22:35:41 +00:00			`# Copyright (C) 2015-2019 Tobias Gruetzmacher`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`from __future__ import absolute_import, division, print_function`

Fix some comics. 2012-11-13 18:12:28 +00:00			`import re`
Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00			`import multiprocessing`
Replace home-grown Python2/3 compat. with six. 2016-05-05 21:33:48 +00:00			`from six.moves.urllib.parse import urlsplit`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00			`# Dictionary with per-host locks.`
			`_locks = {}`
			`# Allowed number of connections per host`
Tests: Keep comics of the same module in the same process This allows our host-based throttling to be effective and keeps cross-process locks to a minimum. 2019-12-03 22:35:41 +00:00			`MaxConnections = 2`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`# Maximum number of strips to get to test a comic`
			`MaxStrips = 5`

Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00
			`def get_lock(host):`
			`"""Get bounded semphore for given host."""`
			`if host not in _locks:`
			`_locks[host] = multiprocessing.BoundedSemaphore(MaxConnections)`
			`return _locks[host]`

Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Tests: Keep comics of the same module in the same process This allows our host-based throttling to be effective and keeps cross-process locks to a minimum. 2019-12-03 22:35:41 +00:00			`def test_comicmodule(tmpdir, scraperobj, worker_id):`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`'''Test a scraper. It must be able to traverse backward for at least 5`
			`strips from the start, and find strip images on at least 4 pages.'''`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`# Limit number of connections to one host.`
Tests: Keep comics of the same module in the same process This allows our host-based throttling to be effective and keeps cross-process locks to a minimum. 2019-12-03 22:35:41 +00:00			`host = urlsplit(scraperobj.url).hostname`
			`with get_lock(host):`
Use py.test's tmpdir fixture. 2016-03-28 14:29:57 +00:00			`_test_comic(str(tmpdir), scraperobj)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
			`def _test_comic(outdir, scraperobj):`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`num_strips = 0`
			`strip = None`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`files = []`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`for strip in scraperobj.getStrips(MaxStrips):`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`files.append(_check_strip(outdir, strip,`
			`scraperobj.multipleImagesPerStrip))`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Remove prevUrlMatchesStripUrl. It was only used for one test. 2016-04-15 23:14:26 +00:00			`if num_strips > 0:`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`_check_stripurl(strip, scraperobj)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`num_strips += 1`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if scraperobj.prevSearch and not scraperobj.hitFirstStripUrl:`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`# subtract the number of skipped URLs with no image from the expected`
			`# image number`
			`num_strips_expected = MaxStrips - len(scraperobj.skippedUrls)`
			`msg = 'Traversed %d strips instead of %d.' % (num_strips,`
			`num_strips_expected)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if strip:`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`msg += " Check the prevSearch pattern at %s" % strip.strip_url`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`assert num_strips == num_strips_expected, msg`
			`if strip:`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`_check_scraperesult(files, num_strips_expected, strip, scraperobj)`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00

			`def _check_strip(outdir, strip, multipleImagesPerStrip):`
			`'''Check that a specific page yields images and the comic module correctly`
			`declares if there are multiple images per page.'''`
			`images = []`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`files = []`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`for image in strip.getImages():`
			`images.append(image.url)`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00
			`# write a fake image (to download less)`
			`fakeimg = image._fnbase(outdir) + '.fake'`
			`with open(fakeimg, 'w') as f:`
			`f.write("fake image for testing")`

			`fn, _ = image.save(outdir)`
			`files.append(fn)`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`assert images, 'failed to find images at %s' % strip.strip_url`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`if not multipleImagesPerStrip:`
Kenneth Reitz’s Code Style™ See http://docs.python-requests.org/en/master/dev/contributing/#kenneth-reitz-s-code-style Effectively, this removes "visual" indents. 2018-06-29 17:26:17 +00:00			`assert len(images) == 1, 'found more than 1 image at {}: {}'.format(`
			`strip.strip_url, images)`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`return files`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`def _check_scraperesult(saved_images, num_images_expected, strip, scraperobj):`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`'''Check that exactly or for multiple pages at least num_strips images are`
			`saved. This checks saved files, ie. it detects duplicate filenames.'''`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`num_images = len(saved_images)`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`attrs = (num_images, saved_images, num_images_expected)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if scraperobj.multipleImagesPerStrip:`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`err = 'saved %d %s instead of at least %d images' % attrs`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`assert num_images >= num_images_expected, err`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`else:`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`err = 'saved %d %s instead of %d images' % attrs`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`assert num_images == num_images_expected, err`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
			`def _check_stripurl(strip, scraperobj):`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if not scraperobj.stripUrl:`
			`# no indexing support`
			`return`
			`# test that the stripUrl regex matches the retrieved strip URL`
			`urlmatch = re.escape(scraperobj.stripUrl)`
Fix urlmatch test 2019-12-04 23:58:31 +00:00			`urlmatch = urlmatch.replace(r"%s", r".+")`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`urlmatch = "^%s$" % urlmatch`
			`ro = re.compile(urlmatch)`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`mo = ro.search(strip.strip_url)`
Kenneth Reitz’s Code Style™ See http://docs.python-requests.org/en/master/dev/contributing/#kenneth-reitz-s-code-style Effectively, this removes "visual" indents. 2018-06-29 17:26:17 +00:00			`err = 'strip URL {!r} does not match stripUrl pattern {}'.format(`
			`strip.strip_url, urlmatch)`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`assert mo is not None, err`