dosage/tests/test_comics.py

135 lines
5 KiB
Python
Raw Normal View History

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
2013-02-10 17:25:21 +00:00
# Copyright (C) 2012-2013 Bastian Kleineidam
2012-06-20 19:58:13 +00:00
import tempfile
import shutil
2012-11-13 18:12:28 +00:00
import re
import os
2013-02-28 20:08:43 +00:00
import multiprocessing
import urlparse
2012-10-11 12:55:54 +00:00
from itertools import islice
2012-06-20 19:58:13 +00:00
from unittest import TestCase
from dosagelib import scraper
2013-02-28 20:08:43 +00:00
def get_host(url):
    """Return the lowercased network-location part of the given URL."""
    netloc = urlparse.urlsplit(url).netloc
    return netloc.lower()
# Allowed number of connections per host
MaxConnections = 4
# Registry of the semaphores handed out so far, keyed by host.
_locks = {}


def get_lock(host):
    """Return the bounded semaphore for the given host.

    The semaphore is created with MaxConnections slots the first time a host
    is requested and the same object is returned on every later call.
    """
    semaphore = _locks.get(host)
    if semaphore is None:
        semaphore = multiprocessing.BoundedSemaphore(MaxConnections)
        _locks[host] = semaphore
    return semaphore
2012-06-20 19:58:13 +00:00
class _ComicTester(TestCase):
    """Basic comic test class.

    The scraper under test is read from the class attribute ``scraperclass``,
    which is None here and has to be supplied by subclasses.
    """
    scraperclass = None

    def setUp(self):
        """Record the scraper's name and start URL and create a scratch dir."""
        self.name = self.scraperclass.getName()
        self.url = self.scraperclass.starter()
        # create a temporary directory for images
        self.tmpdir = tempfile.mkdtemp()

    def tearDown(self):
        """Remove the temporary image directory created in setUp()."""
        shutil.rmtree(self.tmpdir)

    def get_saved_images(self):
        """Get saved images.

        Returns the names of the entries in the scraper's directory below
        the temporary directory, or an empty list if that directory does not
        exist.
        """
        dirs = tuple(self.name.split('/'))
        imagedir = os.path.join(self.tmpdir, *dirs)
        if not os.path.isdir(imagedir):
            # Report "no images" instead of raising OSError from listdir(),
            # so the image count check fails with a readable message.
            return []
        return os.listdir(imagedir)

    def test_comic(self):
        # Test a scraper. It must be able to traverse backward for
        # at least 5 strips from the start, and find strip images
        # on at least 4 pages.
        scraperobj = self.scraperclass()
        # Limit number of connections to one host.
        host = get_host(scraperobj.url)
        # Only the lock creation is guarded: an OSError raised by the test
        # itself must propagate instead of triggering a second test run.
        try:
            lock = get_lock(host)
        except (OSError, ImportError):
            # interprocess lock not supported
            lock = None
        if lock is None:
            self._test_comic(scraperobj)
        else:
            with lock:
                self._test_comic(scraperobj)

    def _test_comic(self, scraperobj):
        """Fetch up to five strips from scraperobj and check the results.

        Each strip must provide images (exactly one unless the scraper
        class sets multipleImagesPerStrip) and every image is saved to the
        temporary directory. For scrapers with a prevSearch that did not
        hit their first strip URL, the traversal length and the number of
        saved files are checked as well.
        """
        num = 0
        max_strips = 5
        # Stays None when the scraper yields no strip at all.
        strip = None
        for strip in scraperobj.getStrips(max_strips):
            images = []
            for image in strip.getImages():
                images.append(image.url)
                self.save(image)
            self.check(images, 'failed to find images at %s' % strip.stripUrl)
            if not self.scraperclass.multipleImagesPerStrip:
                self.check(len(images) == 1, 'found more than 1 image at %s: %s' % (strip.stripUrl, images))
            if num > 0 and self.scraperclass.prevUrlMatchesStripUrl:
                self.check_stripurl(strip)
            num += 1
        if self.scraperclass.prevSearch and not scraperobj.hitFirstStripUrl:
            # Without any strip there is no strip URL to report; fall back
            # to the start URL so the failure is an assertion, not a
            # NameError.
            if strip is None:
                last_url = self.url
            else:
                last_url = strip.stripUrl
            self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern at %s.' % (num, last_url))
            # Check that exactly or for multiple pages at least 5 images are saved.
            # This is different than the image number check above since it checks saved files,
            # ie. it detects duplicate filenames.
            saved_images = self.get_saved_images()
            num_images = len(saved_images)
            # subtract the number of skipped URLs with no image from the expected image number
            num_images_expected = max_strips - len(scraperobj.skippedUrls)
            attrs = (num_images, saved_images, num_images_expected, self.tmpdir)
            if self.scraperclass.multipleImagesPerStrip:
                self.check(num_images >= num_images_expected, 'saved %d %s instead of at least %d images in %s' % attrs)
            else:
                self.check(num_images == num_images_expected, 'saved %d %s instead of %d images in %s' % attrs)

    def check_stripurl(self, strip):
        """Check that strip.stripUrl matches the scraper's stripUrl template.

        Nothing is checked when the scraper class has no stripUrl.
        """
        if not self.scraperclass.stripUrl:
            # no indexing support
            return
        # test that the stripUrl regex matches the retrieved strip URL
        # The template is split at its "%s" placeholders before escaping, so
        # the result does not depend on whether re.escape() escapes "%"
        # (it stopped doing so in Python 3.7).
        pieces = self.scraperclass.stripUrl.split("%s")
        urlmatch = ".+".join(re.escape(piece) for piece in pieces)
        urlmatch = "^%s$" % urlmatch
        ro = re.compile(urlmatch)
        mo = ro.search(strip.stripUrl)
        self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch))

    def save(self, image):
        """Save image into the temporary directory; fail the test on error."""
        try:
            image.save(self.tmpdir)
        except Exception as msg:
            self.check(False, 'could not save %s at %s to %s: %s' % (image.url, image.referrer, self.tmpdir, msg))

    def check(self, condition, msg):
        """Assert condition, prefixing msg with the scraper name and URL."""
        self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg))
2012-10-11 12:40:54 +00:00
2012-06-20 19:58:13 +00:00
def make_comic_tester(name, **kwargs):
    """Build a new _ComicTester subclass.

    The class is called ``name`` and the keyword arguments become its class
    attributes.
    """
    bases = (_ComicTester,)
    attributes = kwargs
    return type(name, bases, attributes)
2012-06-20 19:58:13 +00:00
def generate_comic_testers():
    """Create one test class per comic scraper in this module's globals.

    Every class is named "Test" plus the scraper class name. When the
    TRAVIS environment variable is set only the first 300 scraper classes
    get a test class.
    """
    scraperclasses = scraper.get_scraperclasses()
    if "TRAVIS" in os.environ:
        # Get limited number of scraper tests on Travis builds.
        max_scrapers = 300
        scraperclasses = islice(scraperclasses, 0, max_scrapers)
    namespace = globals()
    for scraperclass in scraperclasses:
        testname = 'Test' + scraperclass.__name__
        namespace[testname] = make_comic_tester(testname, scraperclass=scraperclass)
2012-06-20 19:58:13 +00:00
# Runs at import time: stores the generated Test<ScraperName> classes in this
# module's globals.
generate_comic_testers()