dosage/tests/test_comics.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
import tempfile
import shutil
import re
import os
import multiprocessing
try:
    from urllib.parse import urlsplit
except ImportError:
    from urlparse import urlsplit
from unittest import TestCase
from dosagelib import scraper


def get_host(url):
    """Return the lower-cased network location (netloc) component of url."""
    netloc = urlsplit(url).netloc
    return netloc.lower()


# Cache of semaphores, keyed by the host string passed to get_lock().
_locks = {}
# Allowed number of connections per host
MaxConnections = 4


def get_lock(host):
    """Get bounded semaphore for given host, creating it on first request."""
    semaphore = _locks.get(host)
    if semaphore is None:
        # First request for this host: create the semaphore and cache it
        # so later calls with the same host get the same object back.
        semaphore = multiprocessing.BoundedSemaphore(MaxConnections)
        _locks[host] = semaphore
    return semaphore


class _ComicTester(TestCase):
    """Basic comic test class.

    Base class for the per-scraper test classes. A subclass sets the
    ``scraperclass`` attribute; on the base class it is None and the
    test is skipped.
    """
    # Scraper class under test; None on this base class.
    scraperclass = None

    def setUp(self):
        """Store scraper name and start URL, and create the image directory."""
        if self.scraperclass is not None:
            self.name = self.scraperclass.getName()
            self.url = self.scraperclass.starter()
            # create a temporary directory for images
            self.tmpdir = tempfile.mkdtemp()
        else:
            # Nothing to clean up in tearDown for the skipped base class.
            self.tmpdir = None

    def tearDown(self):
        """Remove the temporary image directory if setUp created one."""
        if self.tmpdir is not None:
            shutil.rmtree(self.tmpdir)

    def get_saved_images(self, filtertxt=False):
        """Get saved images.

        Lists the directory below the temporary directory whose path
        components are the "/"-separated parts of the scraper name.
        If filtertxt is true, names ending in ".txt" are left out.
        """
        dirs = tuple(self.name.split('/'))
        files = os.listdir(os.path.join(self.tmpdir, *dirs))
        if filtertxt:
            files = [x for x in files if not x.endswith(".txt")]
        return files

    def test_comic(self):
        if self.scraperclass is None:
            # only run subclasses
            import pytest
            pytest.skip("base class")
        # Test a scraper. It must be able to traverse the requested
        # number of strips from the start, and find strip images on
        # every traversed page.
        scraperobj = self.scraperclass()
        # Limit number of connections to one host.
        host = get_host(scraperobj.url)
        # Only creating and acquiring the lock is guarded. An OSError
        # raised by the test itself (for example an I/O error) must
        # propagate instead of triggering a second, unlocked test run.
        try:
            lock = get_lock(host)
            lock.acquire()
        except OSError:
            # interprocess lock not supported
            lock = None
        try:
            self._test_comic(scraperobj)
        finally:
            if lock is not None:
                lock.release()

    def _test_comic(self, scraperobj):
        """Traverse up to max_strips strips, save and check their images.

        Every strip must yield at least one image, and exactly one when
        the scraper class does not allow multiple images per strip.
        Unless the scraper has no prevSearch or hit its first strip URL,
        the number of traversed strips and saved images is checked too.
        """
        num_strips = 0
        max_strips = 5
        # Stays None when getStrips() yields nothing.
        strip = None
        for strip in scraperobj.getStrips(max_strips):
            images = []
            for image in strip.getImages():
                images.append(image.url)
                self.save(image)
            self.check(images, 'failed to find images at %s' % strip.stripUrl)
            if not self.scraperclass.multipleImagesPerStrip:
                self.check(len(images) == 1, 'found more than 1 image at %s: %s' % (strip.stripUrl, images))
            # The first strip is exempt from the stripUrl pattern check.
            if num_strips > 0 and self.scraperclass.prevUrlMatchesStripUrl:
                self.check_stripurl(strip)
            num_strips += 1
        if self.scraperclass.prevSearch and not scraperobj.hitFirstStripUrl:
            # check strips
            num_strips_expected = max_strips - len(scraperobj.skippedUrls)
            msg = 'Traversed %d strips instead of %d.' % (num_strips, num_strips_expected)
            if strip is not None:
                msg += " Check the prevSearch pattern at %s" % strip.stripUrl
            self.check(num_strips == num_strips_expected, msg)
            # check images
            if strip is not None:
                self.check_scraperesult(num_strips_expected, strip, scraperobj)

    def check_scraperesult(self, num_images_expected, strip, scraperobj):
        """Check the number of files saved for this scraper.

        The count must equal num_images_expected, or be at least that
        number when the scraper class allows multiple images per strip.
        The strip argument is not used by this check.
        """
        # This checks saved files, ie. it detects duplicate filenames.
        saved_images = self.get_saved_images(filtertxt=bool(scraperobj.textSearch))
        num_images = len(saved_images)
        # num_images_expected is supplied by the caller, which has already
        # subtracted the skipped URLs from it.
        attrs = (num_images, saved_images, num_images_expected, self.tmpdir)
        if self.scraperclass.multipleImagesPerStrip:
            self.check(num_images >= num_images_expected, 'saved %d %s instead of at least %d images in %s' % attrs)
        else:
            self.check(num_images == num_images_expected, 'saved %d %s instead of %d images in %s' % attrs)

    def check_stripurl(self, strip):
        """Check that the strip URL matches the scraper's stripUrl template.

        Each "%s" placeholder in the template matches one or more
        arbitrary characters; everything else must match literally.
        Does nothing when the scraper class has no stripUrl.
        """
        if not self.scraperclass.stripUrl:
            # no indexing support
            return
        # Escape the literal pieces around each "%s" separately and join
        # them with ".+". This does not depend on whether re.escape()
        # escapes "%", which differs between Python versions.
        literal_parts = self.scraperclass.stripUrl.split("%s")
        urlmatch = ".+".join(re.escape(part) for part in literal_parts)
        urlmatch = "^%s$" % urlmatch
        ro = re.compile(urlmatch)
        mo = ro.search(strip.stripUrl)
        self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch))

    def save(self, image):
        """Save image into the temporary directory; fail the test on error."""
        try:
            image.save(self.tmpdir)
        except Exception as msg:
            self.check(False, 'could not save %s at %s to %s: %s' % (image.url, image.referrer, self.tmpdir, msg))

    def check(self, condition, msg):
        """Assert condition, prefixing msg with the scraper name and start URL."""
        self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg))


def make_comic_tester(name, **kwargs):
    """Build a new _ComicTester subclass called name with kwargs as class attributes."""
    bases = (_ComicTester,)
    class_attributes = kwargs
    return type(name, bases, class_attributes)


def generate_comic_testers():
    """Create a test class in this module's globals for each selected scraper.

    With TESTALL in the environment every scraper class is selected
    (this will take some time); otherwise only the scrapers named in
    the short built-in list are.
    """
    namespace = globals()
    selected = scraper.get_scraperclasses()
    if "TESTALL" not in os.environ:
        # Get limited number of scraper tests on Travis builds to make
        # it faster
        testscrapernames = ['GoComics/CalvinandHobbes']
        selected = [
            candidate for candidate in selected
            if candidate.getName() in testscrapernames
        ]
    for candidate in selected:
        testname = 'Test' + candidate.__name__
        namespace[testname] = make_comic_tester(testname, scraperclass=candidate)


# Runs at import time: adds the generated test classes to this module's
# globals.
generate_comic_testers()
Updated copyright for all source files. 2012-06-20 20:41:04 +00:00			`# -- coding: iso-8859-1 --`
			`# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs`
Updated copyright. 2014-01-05 15:50:57 +00:00			`# Copyright (C) 2012-2014 Bastian Kleineidam`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`import tempfile`
			`import shutil`
Fix some comics. 2012-11-13 18:12:28 +00:00			`import re`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`import os`
Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00			`import multiprocessing`
Fix for python 3.3 2013-11-07 20:22:38 +00:00			`try:`
			`from urllib.parse import urlsplit`
			`except ImportError:`
			`from urlparse import urlsplit`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`from unittest import TestCase`
			`from dosagelib import scraper`


Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00			`def get_host(url):`
			`"""Get host part of URL."""`
Fix for python 3.3 2013-11-07 20:22:38 +00:00			`return urlsplit(url)[1].lower()`
Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00

			`# Dictionary with per-host locks.`
			`_locks = {}`
			`# Allowed number of connections per host`
			`MaxConnections = 4`

			`def get_lock(host):`
			`"""Get bounded semphore for given host."""`
			`if host not in _locks:`
			`_locks[host] = multiprocessing.BoundedSemaphore(MaxConnections)`
			`return _locks[host]`


Initial commit to Github. 2012-06-20 19:58:13 +00:00			`class _ComicTester(TestCase):`
			`"""Basic comic test class."""`
			`scraperclass=None`

Fix test running. 2012-10-11 12:40:54 +00:00			`def setUp(self):`
Fix tests with newer pytest versions. 2013-06-28 19:25:55 +00:00			`if self.scraperclass is not None:`
			`self.name = self.scraperclass.getName()`
			`self.url = self.scraperclass.starter()`
			`# create a temporary directory for images`
			`self.tmpdir = tempfile.mkdtemp()`
Initialize tmpdir. 2013-06-28 19:34:01 +00:00			`else:`
			`self.tmpdir = None`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00
			`def tearDown(self):`
Initialize tmpdir. 2013-06-28 19:34:01 +00:00			`if self.tmpdir is not None:`
			`shutil.rmtree(self.tmpdir)`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00
Consider text files in result checks. 2013-12-05 17:29:15 +00:00			`def get_saved_images(self, filtertxt=False):`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`"""Get saved images."""`
			`dirs = tuple(self.name.split('/'))`
Consider text files in result checks. 2013-12-05 17:29:15 +00:00			`files = os.listdir(os.path.join(self.tmpdir, *dirs))`
			`if filtertxt:`
			`files = [x for x in files if not x.endswith(".txt")]`
			`return files`
Fix test running. 2012-10-11 12:40:54 +00:00
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`def test_comic(self):`
Fix tests with newer pytest versions. 2013-06-28 19:25:55 +00:00			`if self.scraperclass is None:`
			`# only run subclasses`
			`import pytest`
			`pytest.skip("base class")`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`# Test a scraper. It must be able to traverse backward for`
Fix some comics. 2012-11-13 18:12:28 +00:00			`# at least 5 strips from the start, and find strip images`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`# on at least 4 pages.`
Fix test running. 2012-10-11 12:40:54 +00:00			`scraperobj = self.scraperclass()`
Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00			`# Limit number of connections to one host.`
			`host = get_host(scraperobj.url)`
Fall back to non-locked tests on systems without interprocess locking permission (eg. Travis). 2013-03-01 06:27:41 +00:00			`try:`
			`with get_lock(host):`
			`self._test_comic(scraperobj)`
			`except OSError:`
			`# interprocess lock not supported`
Limit connections to hosts for tests. 2013-02-28 20:08:43 +00:00			`self._test_comic(scraperobj)`

			`def _test_comic(self, scraperobj):`
Improve comic strip and image checks. 2013-04-04 16:30:03 +00:00			`num_strips = 0`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`max_strips = 5`
Improve comic strip and image checks. 2013-04-04 16:30:03 +00:00			`strip = None`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`for strip in scraperobj.getStrips(max_strips):`
Fix more comics. 2012-12-07 23:45:18 +00:00			`images = []`
Fix empty image list test. 2012-10-11 13:17:01 +00:00			`for image in strip.getImages():`
Fix more comics. 2012-12-07 23:45:18 +00:00			`images.append(image.url)`
Fix test running. 2012-10-11 12:40:54 +00:00			`self.save(image)`
Fix more comics. 2012-12-07 23:45:18 +00:00			`self.check(images, 'failed to find images at %s' % strip.stripUrl)`
Fix comics. 2012-12-04 06:02:40 +00:00			`if not self.scraperclass.multipleImagesPerStrip:`
Fix more comics. 2012-12-07 23:45:18 +00:00			`self.check(len(images) == 1, 'found more than 1 image at %s: %s' % (strip.stripUrl, images))`
Improve comic strip and image checks. 2013-04-04 16:30:03 +00:00			`if num_strips > 0 and self.scraperclass.prevUrlMatchesStripUrl:`
Fix some comics. 2012-11-21 20:57:26 +00:00			`self.check_stripurl(strip)`
Improve comic strip and image checks. 2013-04-04 16:30:03 +00:00			`num_strips += 1`
			`if self.scraperclass.prevSearch and not scraperobj.hitFirstStripUrl:`
			`# check strips`
			`num_strips_expected = max_strips - len(scraperobj.skippedUrls)`
			`msg = 'Traversed %d strips instead of %d.' % (num_strips, num_strips_expected)`
			`if strip:`
			`msg += " Check the prevSearch pattern at %s" % strip.stripUrl`
			`self.check(num_strips == num_strips_expected, msg)`
			`# check images`
			`if strip:`
			`self.check_scraperesult(num_strips_expected, strip, scraperobj)`
Fix comic test with zero strip images. 2013-03-19 19:45:45 +00:00
Improve comic strip and image checks. 2013-04-04 16:30:03 +00:00			`def check_scraperesult(self, num_images_expected, strip, scraperobj):`
			`# Check that exactly or for multiple pages at least num_strips images are saved.`
			`# This checks saved files, ie. it detects duplicate filenames.`
Consider text files in result checks. 2013-12-05 17:29:15 +00:00			`saved_images = self.get_saved_images(filtertxt=bool(scraperobj.textSearch))`
Fix comic test with zero strip images. 2013-03-19 19:45:45 +00:00			`num_images = len(saved_images)`
			`# subtract the number of skipped URLs with no image from the expected image number`
			`attrs = (num_images, saved_images, num_images_expected, self.tmpdir)`
			`if self.scraperclass.multipleImagesPerStrip:`
			`self.check(num_images >= num_images_expected, 'saved %d %s instead of at least %d images in %s' % attrs)`
			`else:`
			`self.check(num_images == num_images_expected, 'saved %d %s instead of %d images in %s' % attrs)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
Fix some comics. 2012-11-21 20:57:26 +00:00			`def check_stripurl(self, strip):`
			`if not self.scraperclass.stripUrl:`
			`# no indexing support`
			`return`
			`# test that the stripUrl regex matches the retrieved strip URL`
			`urlmatch = re.escape(self.scraperclass.stripUrl)`
			`urlmatch = urlmatch.replace(r"\%s", r".+")`
			`urlmatch = "^%s$" % urlmatch`
			`ro = re.compile(urlmatch)`
			`mo = ro.search(strip.stripUrl)`
			`self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch))`

Fix test running. 2012-10-11 12:40:54 +00:00			`def save(self, image):`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`try:`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`image.save(self.tmpdir)`
Updated documentation and fix some comics. 2012-11-20 17:53:53 +00:00			`except Exception as msg:`
Improve error message. 2013-02-20 19:51:15 +00:00			`self.check(False, 'could not save %s at %s to %s: %s' % (image.url, image.referrer, self.tmpdir, msg))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
Fix test running. 2012-10-11 12:40:54 +00:00			`def check(self, condition, msg):`
Improve test output. 2012-10-11 18:19:10 +00:00			`self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg))`
Fix test running. 2012-10-11 12:40:54 +00:00
Initial commit to Github. 2012-06-20 19:58:13 +00:00
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`def make_comic_tester(name, **kwargs):`
			`"""Create and return a _ComicTester class with given name and attributes."""`
			`return type(name, (_ComicTester,), kwargs)`


Initial commit to Github. 2012-06-20 19:58:13 +00:00			`def generate_comic_testers():`
Limit comic tests for now. 2012-10-11 12:55:54 +00:00			`"""For each comic scraper, create a test class."""`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`g = globals()`
Fix random test case. 2013-11-08 17:40:55 +00:00			`if "TESTALL" in os.environ:`
			`# test all comics (this will take some time)`
Limit number of tests for Travis CI. 2013-02-15 19:39:20 +00:00			`scraperclasses = scraper.get_scraperclasses()`
Fix random test case. 2013-11-08 17:40:55 +00:00			`else:`
			`# Get limited number of scraper tests on Travis builds to make`
			`# it faster`
			`testscrapernames = ['GoComics/CalvinandHobbes']`
			`scraperclasses = [`
			`scraperclass for scraperclass in scraper.get_scraperclasses()`
			`if scraperclass.getName() in testscrapernames`
			`]`
Limit number of tests for Travis CI. 2013-02-15 19:39:20 +00:00			`for scraperclass in scraperclasses:`
Fix test running. 2012-10-11 12:40:54 +00:00			`name = 'Test'+scraperclass.__name__`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`g[name] = make_comic_tester(name, scraperclass=scraperclass)`

Initial commit to Github. 2012-06-20 19:58:13 +00:00
			`generate_comic_testers()`