dosage/tests/test_comics.py

# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher

import re
import os
import multiprocessing
try:
    from urllib.parse import urlsplit
except ImportError:
    from urlparse import urlsplit
from dosagelib import scraper
from . import tmpdir  # noqa


def get_host(url):
    """Return the lower-cased network location (host part) of the URL."""
    netloc = urlsplit(url).netloc
    return netloc.lower()


# Per-host semaphores, keyed by host name. Filled lazily by get_lock().
_locks = {}
# Size of each per-host bounded semaphore, i.e. the allowed number of
# simultaneous connections per host.
MaxConnections = 4
# Maximum number of strips to fetch when testing a single comic. Also the
# basis for the expected number of traversed strips.
MaxStrips = 5


def get_lock(host):
    """Return the bounded semaphore for the given host.

    The semaphore is created with MaxConnections slots on first use and
    cached in _locks, so later calls for the same host get the same object.
    """
    semaphore = _locks.get(host)
    if semaphore is None:
        semaphore = multiprocessing.BoundedSemaphore(MaxConnections)
        _locks[host] = semaphore
    return semaphore


def _get_saved_images(outdir, scraper):
    """Get saved images."""
    dirs = tuple(scraper.getName().split('/'))
    files = os.listdir(os.path.join(outdir, *dirs))
    files = [x for x in files if not x.endswith(".txt")]
    return files


def test_comicmodule(tmpdir, scraperclass):  # noqa
    '''Test a scraper class by running the comic checks on an instance.

    The checks run while holding the per-host semaphore so that the number
    of simultaneous connections to one host stays limited. If the semaphore
    cannot be created (OSError), the checks run without any lock.
    '''
    scraperobj = scraperclass()
    # Limit number of connections to one host.
    host = get_host(scraperobj.url)
    try:
        lock = get_lock(host)
    except OSError:
        # interprocess lock not supported
        lock = None

    # Keep the comic test itself outside the try block: on Python 3 IOError
    # is an alias of OSError, so network or file errors raised by the test
    # would otherwise be mistaken for a missing lock and silently trigger a
    # second, unlocked run of the whole test.
    if lock is None:
        _test_comic(tmpdir, scraperobj)
    else:
        with lock:
            _test_comic(tmpdir, scraperobj)


def _test_comic(outdir, scraperobj):
    '''Fetch up to MaxStrips strips from the scraper and validate them.

    Every strip is checked for images. From the second strip on, the strip
    URL is also validated when the scraper sets prevUrlMatchesStripUrl.
    For scrapers that have a prevSearch and did not hit their first strip,
    the number of traversed strips and of saved images is verified too.
    '''
    last_strip = None
    traversed = 0
    strips = scraperobj.getStrips(MaxStrips)
    for traversed, last_strip in enumerate(strips, 1):
        _check_strip(outdir, last_strip, scraperobj.multipleImagesPerStrip)
        # The first strip is exempt from the strip URL check.
        if traversed > 1 and scraperobj.prevUrlMatchesStripUrl:
            _check_stripurl(last_strip, scraperobj)

    if not scraperobj.prevSearch or scraperobj.hitFirstStripUrl:
        # Nothing to verify about the traversal length.
        return

    # subtract the number of skipped URLs with no image from the expected
    # image number
    expected = MaxStrips - len(scraperobj.skippedUrls)
    msg = 'Traversed %d strips instead of %d.' % (traversed, expected)
    if last_strip:
        msg += " Check the prevSearch pattern at %s" % last_strip.stripUrl
    assert traversed == expected, msg
    if last_strip:
        _check_scraperesult(outdir, expected, last_strip, scraperobj)


def _check_strip(outdir, strip, multipleImagesPerStrip):
    '''Check that a specific page yields images and the comic module correctly
    declares if there are multiple images per page.'''
    images = []
    for image in strip.getImages():
        images.append(image.url)
        image.save(outdir)
    assert images, 'failed to find images at %s' % strip.stripUrl
    if not multipleImagesPerStrip:
        assert len(images) == 1, 'found more than 1 image at %s: %s' % (
                strip.stripUrl, images)


def _check_scraperesult(outdir, num_images_expected, strip, scraperobj):
    '''Verify the number of image files saved for the scraper.

    Scrapers with multipleImagesPerStrip need at least num_images_expected
    saved files, all others exactly that many. Counting the saved files
    means that duplicate filenames are detected. The strip argument is
    not used.
    '''
    found = _get_saved_images(outdir, scraperobj)
    count = len(found)
    details = (count, found, num_images_expected, outdir)

    if not scraperobj.multipleImagesPerStrip:
        err = 'saved %d %s instead of %d images in %s' % details
        assert count == num_images_expected, err
        return

    err = 'saved %d %s instead of at least %d images in %s' % details
    assert count >= num_images_expected, err


def _check_stripurl(strip, scraperobj):
    if not scraperobj.stripUrl:
        # no indexing support
        return
    # test that the stripUrl regex matches the retrieved strip URL
    urlmatch = re.escape(scraperobj.stripUrl)
    urlmatch = urlmatch.replace(r"\%s", r".+")
    urlmatch = "^%s$" % urlmatch
    ro = re.compile(urlmatch)
    mo = ro.search(strip.stripUrl)
    err = 'strip URL %r does not match stripUrl pattern %s' % (
            strip.stripUrl, urlmatch)
    assert mo is not None, err


def get_test_scraperclasses():
    """Return the scraper classes that should be tested.

    With TESTALL in the environment all scraper classes are returned.
    Otherwise only those whose name matches, from its start, the regular
    expression in TESTCOMICS, or a small built-in default selection if
    TESTCOMICS is not set either.
    """
    if "TESTALL" in os.environ:
        # test all comics (this will take some time)
        return scraper.get_scraperclasses()

    if 'TESTCOMICS' in os.environ:
        name_pattern = re.compile(os.environ['TESTCOMICS'])
    else:
        # Get limited number of scraper tests on Travis builds to make it
        # faster
        default_names = (
            'AbstruseGoose',
            'GoComics/CalvinandHobbes',
            'xkcd',
        )
        name_pattern = re.compile('|'.join(default_names))

    return [
        candidate for candidate in scraper.get_scraperclasses()
        if name_pattern.match(candidate.getName())
    ]


def pytest_generate_tests(metafunc):
    """Parametrize tests that request 'scraperclass' with the scraper classes
    selected by get_test_scraperclasses()."""
    if 'scraperclass' not in metafunc.fixturenames:
        return
    selected = get_test_scraperclasses()
    metafunc.parametrize('scraperclass', selected)
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`# -- coding: utf-8 --`
Updated copyright for all source files. 2012-06-20 20:41:04 +00:00			`# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs`
Updated copyright. 2014-01-05 15:50:57 +00:00			`# Copyright (C) 2012-2014 Bastian Kleineidam`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`# Copyright (C) 2015-2016 Tobias Gruetzmacher`

Fix some comics. 2012-11-13 18:12:28 +00:00			`import re`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`import os`
Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00			`import multiprocessing`
Fix for python 3.3 2013-11-07 20:22:38 +00:00			`try:`
			`from urllib.parse import urlsplit`
			`except ImportError:`
			`from urlparse import urlsplit`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`from dosagelib import scraper`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`from . import tmpdir # noqa`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00			`def get_host(url):`
			`"""Get host part of URL."""`
Fix for python 3.3 2013-11-07 20:22:38 +00:00			`return urlsplit(url)[1].lower()`
Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00

			`# Dictionary with per-host locks.`
			`_locks = {}`
			`# Allowed number of connections per host`
			`MaxConnections = 4`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`# Maximum number of strips to get to test a comic`
			`MaxStrips = 5`

Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00
			`def get_lock(host):`
			`"""Get bounded semphore for given host."""`
			`if host not in _locks:`
			`_locks[host] = multiprocessing.BoundedSemaphore(MaxConnections)`
			`return _locks[host]`

Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`def _get_saved_images(outdir, scraper):`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`"""Get saved images."""`
			`dirs = tuple(scraper.getName().split('/'))`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`files = os.listdir(os.path.join(outdir, *dirs))`
			`files = [x for x in files if not x.endswith(".txt")]`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`return files`

Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
			`def test_comicmodule(tmpdir, scraperclass): # noqa`
			`'''Test a scraper. It must be able to traverse backward for at least 5`
			`strips from the start, and find strip images on at least 4 pages.'''`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`scraperobj = scraperclass()`
			`# Limit number of connections to one host.`
			`host = get_host(scraperobj.url)`
			`try:`
			`with get_lock(host):`
			`_test_comic(tmpdir, scraperobj)`
			`except OSError:`
			`# interprocess lock not supported`
			`_test_comic(tmpdir, scraperobj)`

Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
			`def _test_comic(outdir, scraperobj):`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`num_strips = 0`
			`strip = None`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`for strip in scraperobj.getStrips(MaxStrips):`
			`_check_strip(outdir, strip, scraperobj.multipleImagesPerStrip)`

Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if num_strips > 0 and scraperobj.prevUrlMatchesStripUrl:`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`_check_stripurl(strip, scraperobj)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`num_strips += 1`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if scraperobj.prevSearch and not scraperobj.hitFirstStripUrl:`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`# subtract the number of skipped URLs with no image from the expected`
			`# image number`
			`num_strips_expected = MaxStrips - len(scraperobj.skippedUrls)`
			`msg = 'Traversed %d strips instead of %d.' % (num_strips,`
			`num_strips_expected)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if strip:`
			`msg += " Check the prevSearch pattern at %s" % strip.stripUrl`
			`assert num_strips == num_strips_expected, msg`
			`if strip:`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`_check_scraperesult(outdir, num_strips_expected, strip, scraperobj)`


			`def _check_strip(outdir, strip, multipleImagesPerStrip):`
			`'''Check that a specific page yields images and the comic module correctly`
			`declares if there are multiple images per page.'''`
			`images = []`
			`for image in strip.getImages():`
			`images.append(image.url)`
			`image.save(outdir)`
			`assert images, 'failed to find images at %s' % strip.stripUrl`
			`if not multipleImagesPerStrip:`
			`assert len(images) == 1, 'found more than 1 image at %s: %s' % (`
			`strip.stripUrl, images)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
			`def _check_scraperesult(outdir, num_images_expected, strip, scraperobj):`
			`'''Check that exactly or for multiple pages at least num_strips images are`
			`saved. This checks saved files, ie. it detects duplicate filenames.'''`
			`saved_images = _get_saved_images(outdir, scraperobj)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`num_images = len(saved_images)`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
			`attrs = (num_images, saved_images, num_images_expected, outdir)`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if scraperobj.multipleImagesPerStrip:`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`err = 'saved %d %s instead of at least %d images in %s' % attrs`
			`assert num_images >= num_images_expected, err`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`else:`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`err = 'saved %d %s instead of %d images in %s' % attrs`
			`assert num_images == num_images_expected, err`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
			`def _check_stripurl(strip, scraperobj):`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`if not scraperobj.stripUrl:`
			`# no indexing support`
			`return`
			`# test that the stripUrl regex matches the retrieved strip URL`
			`urlmatch = re.escape(scraperobj.stripUrl)`
			`urlmatch = urlmatch.replace(r"\%s", r".+")`
			`urlmatch = "^%s$" % urlmatch`
			`ro = re.compile(urlmatch)`
			`mo = ro.search(strip.stripUrl)`
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00			`err = 'strip URL %r does not match stripUrl pattern %s' % (`
			`strip.stripUrl, urlmatch)`
			`assert mo is not None, err`

Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00
			`def get_test_scraperclasses():`
			`"""Return scrapers that should be tested."""`
Fix random test case. 2013-11-08 17:40:55 +00:00			`if "TESTALL" in os.environ:`
			`# test all comics (this will take some time)`
Limit number of tests for Travis CI. 2013-02-15 19:39:20 +00:00			`scraperclasses = scraper.get_scraperclasses()`
Fix random test case. 2013-11-08 17:40:55 +00:00			`else:`
Allow selection of comics to test via environment. If you need to run test cases for your new comic module, you can call py.test like this: TESTCOMICS="ComicFury/" py.test -v tests/test_comics.py 2016-03-13 12:03:41 +00:00			`if 'TESTCOMICS' in os.environ:`
			`scraper_pattern = re.compile(os.environ['TESTCOMICS'])`
			`else:`
			`# Get limited number of scraper tests on Travis builds to make it`
			`# faster`
			`testscrapernames = [`
			`'AbstruseGoose',`
			`'GoComics/CalvinandHobbes',`
			`'xkcd'`
			`]`
			`scraper_pattern = re.compile('\|'.join(testscrapernames))`

Fix random test case. 2013-11-08 17:40:55 +00:00			`scraperclasses = [`
			`scraperclass for scraperclass in scraper.get_scraperclasses()`
Allow selection of comics to test via environment. If you need to run test cases for your new comic module, you can call py.test like this: TESTCOMICS="ComicFury/" py.test -v tests/test_comics.py 2016-03-13 12:03:41 +00:00			`if scraper_pattern.match(scraperclass.getName())`
Fix random test case. 2013-11-08 17:40:55 +00:00			`]`
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`return scraperclasses`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00
Convert all tests to py.test & cleanups. 2016-03-07 00:08:57 +00:00
Refactor comic module test. All those create-classes-on-the-fly games make my head hurt ;) 2015-07-17 21:33:25 +00:00			`def pytest_generate_tests(metafunc):`
			`if 'scraperclass' in metafunc.fixturenames:`
			`metafunc.parametrize('scraperclass', get_test_scraperclasses())`