Fix comics, improve tests, use python-requests.

This commit is contained in:
Bastian Kleineidam 2012-11-26 18:44:31 +01:00
parent d4eee7719d
commit 0556ffd30a
16 changed files with 191 additions and 403 deletions

View file

@@ -40,10 +40,11 @@ manual page.
Dependencies
-------------
Dosage requires Python version 2.7 or higher, which can be downloaded
from http://www.python.org.
No external Python modules are required - only the Python Standard Library
that gets installed with Python.
Python version 2.7 or higher, which can be downloaded
from http://www.python.org/
Also the python-requests module must be installed, which can be downloaded
from http://docs.python-requests.org/en/latest/
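A quick way to check that the dependency is available (a minimal sketch;
any reasonably recent python-requests release should do):

    import requests  # raises ImportError if python-requests is missing
    print(requests.__version__)
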
Installation
-------------
@@ -59,7 +60,7 @@ or if you do not have root permissions:
Technical Description
----------------------
Dosage is written entirely in Python and relies on regular expressions to
Dosage is written in Python and relies on regular expressions to
do most of the grunt work.
For each webcomic Dosage has a plugin module, found in the "plugins"
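For illustration, a minimal plugin sketch (hypothetical comic name and
URLs; real plugins such as Sheldon below use exactly this
_BasicScraper/tagre machinery):

    from re import compile
    from ..scraper import _BasicScraper
    from ..util import tagre

    class ExampleComic(_BasicScraper):
        latestUrl = 'http://www.example.com/'
        stripUrl = latestUrl + 'archive/%s.html'
        imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
        prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="prev"))
        help = 'Index format: yymmdd'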

View file

@@ -4,6 +4,7 @@ Features:
- cmdline: Added proper return codes for error conditions.
- comics: Added more robust regular expressions for HTML tags.
They now match case-insensitively and ignore whitespace.
- comics: Use the python-requests module for HTTP requests.
Changes:
- installation: Added support for dynamic configuration values.

View file

@@ -2,7 +2,6 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
import urllib2
import os
import locale
import rfc822
@@ -55,18 +54,24 @@ class ComicImage(object):
"""Connect to host and get meta information."""
try:
self.urlobj = urlopen(self.url, referrer=self.referrer)
except urllib2.HTTPError as he:
except IOError as he:
raise FetchComicError('Unable to retrieve URL.', self.url, he.code)
if self.urlobj.info().getmaintype() != 'image' and \
self.urlobj.info().gettype() not in ('application/octet-stream', 'application/x-shockwave-flash'):
content_type = self.urlobj.headers.get('content-type')
content_type = content_type.split(';', 1)[0]
if '/' in content_type:
maintype, subtype = content_type.split('/', 1)
else:
maintype = content_type
subtype = None
if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
raise FetchComicError('No suitable image found to retrieve.', self.url)
# Always use mime type for file extension if it is sane.
if self.urlobj.info().getmaintype() == 'image':
self.ext = '.' + self.urlobj.info().getsubtype().replace('jpeg', 'jpg')
self.contentLength = int(self.urlobj.info().get('content-length', 0))
self.lastModified = self.urlobj.info().get('last-modified')
if maintype == 'image':
self.ext = '.' + subtype.replace('jpeg', 'jpg')
self.contentLength = int(self.urlobj.headers.get('content-length', 0))
self.lastModified = self.urlobj.headers.get('last-modified')
out.write('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength), 2)
def touch(self, filename):
@@ -88,7 +93,6 @@ class ComicImage(object):
fn = os.path.join(comicDir, filename)
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
self.urlobj.close()
self.touch(fn)
out.write('Skipping existing file "%s".' % (fn,), 1)
return fn, False
@@ -97,7 +101,7 @@ class ComicImage(object):
out.write('Writing comic to file %s...' % (fn,), 3)
with open(fn, 'wb') as comicOut:
startTime = time.time()
comicOut.write(self.urlobj.read())
comicOut.write(self.urlobj.content)
endTime = time.time()
self.touch(fn)
except:
@@ -114,7 +118,5 @@ class ComicImage(object):
attrs = dict(fn=fn, bytes=bytes, speed=speed)
out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
getHandler().comicDownloaded(self.name, fn)
finally:
self.urlobj.close()
return fn, True
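
The content-type handling above boils down to a small helper; a
standalone sketch of the same parsing logic:

    def split_content_type(content_type):
        # Strip parameters: "image/jpeg; charset=utf-8" -> "image/jpeg"
        content_type = content_type.split(';', 1)[0]
        if '/' in content_type:
            maintype, subtype = content_type.split('/', 1)
        else:
            maintype, subtype = content_type, None
        return maintype, subtype

    # split_content_type('image/jpeg; charset=utf-8') -> ('image', 'jpeg')
    # split_content_type('application/octet-stream')  -> ('application', 'octet-stream')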

View file

@@ -13,7 +13,7 @@ from ..util import tagre, getQueryParams
class DMFA(_BasicScraper):
latestUrl = 'http://www.missmab.com/'
stripUrl = latestUrl + 'Comics/Vol_%s.php'
imageSearch = compile(tagre("img", "src", r'(Comics/|Vol)[^"]+)'))
imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"])+')+
tagre("img", "src", r'(?:../)?Images/comicprev.gif'))
help = 'Index format: nnn (normally, some specials)'

View file

@@ -4,22 +4,29 @@
from re import compile
from ..scraper import make_scraper
from ..helpers import bounceStarter, queryNamer
from ..helpers import bounceStarter
from ..util import tagre
def add(name):
classname = 'DrunkDuck_%s' % name
url = 'http://www.drunkduck.com/%s/' % name
linkSearch = tagre("a", "href", r"(/[^/]*/index\.php\?p=\d+)", quote="'", after="The %s page")
linkSearch = tagre("a", "href", r"(/%s/\d+/)" % name)
@classmethod
def namer(cls, imageUrl, pageUrl):
index = int(pageUrl.rstrip('/').split('/')[-1])
ext = imageUrl.rsplit('.')[-1]
return '%d.%s' % (index, ext)
globals()[classname] = make_scraper(classname,
name = 'DrunkDuck/' + name,
starter = bounceStarter(url, compile(linkSearch % 'next')),
stripUrl = url + 'index.php?p=%s' % name,
imageSearch = compile(tagre("img", "src", r"(http://[a-z0-9]*\.drunkduck\.com/[^/]*/pages/[^'/]+)", quote="'")),
prevSearch= compile(linkSearch % 'previous'),
starter = bounceStarter(url, compile(linkSearch + tagre("img", "class", "arrow_next"))),
stripUrl = url + '%s/',
imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image")),
prevSearch= compile(linkSearch + tagre("img", "class", "arrow_prev")),
help = 'Index format: n (unpadded)',
namer = queryNamer('p', usePageUrl=True),
namer = namer,
)
comics = (
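
A quick trace of the namer above, with a made-up strip URL in the new
DrunkDuck URL scheme:

    pageUrl = 'http://www.drunkduck.com/Example/5212107/'  # hypothetical
    index = int(pageUrl.rstrip('/').split('/')[-1])        # -> 5212107
    # an image URL ending in '.jpg' is then saved as '5212107.jpg'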

View file

@@ -1,47 +1,26 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from ..scraper import _BasicScraper
from re import compile
from ..scraper import make_scraper
from ..util import asciify
def fallenangel(name, shortname):
pass # XXX
class _TheFallenAngel(_BasicScraper):
imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"')
prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"')
help = 'Index format: yyyymmdd'
@property
def baseUrl(self):
return 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % (self.shortName,)
@property
def stripUrl(self):
return self.baseUrl + '?date=%s'
def starter(self):
return self.baseUrl
class HighMaintenance(_TheFallenAngel):
name = 'TheFallenAngel/HighMaintenance'
shortName = 'hm'
class FAWK(_TheFallenAngel):
name = 'TheFallenAngel/FAWK'
shortName = 'fawk'
class MalloryChan(_TheFallenAngel):
name = 'TheFallenAngel/MalloryChan'
shortName = 'mallorychan'
def add(name, shortname):
latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
classname = asciify(name)
globals()[classname] = make_scraper(classname,
latestUrl = latestUrl,
stripUrl = latestUrl + '?date=%s',
name='FallenAngel/' + name,
imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"'),
prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"'),
help = 'Index format: yyyymmdd',
)
add('HighMaintenance', 'hm')
add('FAWK', 'fawk')
add('MalloryChan', 'mallorychan')

View file

@@ -29,30 +29,30 @@ def add(name, repl=''):
# http://www.gocomics.com/features
# note that comics from creators.com are not repeated here
# Duplicate comics from creators.com are commented out
add('2 Cows and a Chicken')
add('9 Chickweed Lane')
add('9 to 5')
add('The Academia Waltz')
add('Adam at Home')
add('Agnes')
#add('Agnes')
add('Alley Oop', repl='-')
add('Andertoons')
add('Andy Capp')
#add('Andy Capp')
add('Angry Little Girls', repl='-')
add('Animal Crackers')
add('Annie')
add('The Argyle Sweater')
add('Arlo and Janis')
add('Ask Shagg')
add('BC')
#add('Ask Shagg')
#add('BC')
add('Back in the Day')
add('Bad Reporter')
add('Baldo')
add('Ballard Street')
#add('Ballard Street')
add('Banana Triangle', repl='-')
add('Barkeater Lake')
add('The Barn')
#add('The Barn')
add('Barney and Clyde')
add('Basic Instructions')
add('Beardo')
@@ -81,13 +81,13 @@ add('Brewster Rockit')
add('Broom Hilda')
add('The Buckets')
add('Buni')
add('Cafe con Leche')
#add('Cafe con Leche')
add('Calvin and Hobbes')
add('Candorville')
add('Cathy')
add('Cest la Vie')
add('Cheap Thrills Cuisine', repl='-')
add('Chuckle Bros')
#add('Chuckle Bros')
add('Citizen Dog')
add('The City')
add('Cleats')
@@ -99,15 +99,15 @@ add('Cow and Boy')
add('CowTown')
add('Crumb')
add('Cul de Sac')
add('Daddys Home')
#add('Daddys Home')
add('Dark Side of the Horse')
add('Deep Cover')
add('Diamond Lil')
#add('Diamond Lil')
add('Dick Tracy')
add('The Dinette Set')
#add('The Dinette Set')
add('Dixie Drive', repl='-')
add('Dog Eat Doug')
add('Dogs of C Kennel')
#add('Dog Eat Doug')
#add('Dogs of C Kennel')
add('Domestic Abuse')
add('Doonesbury')
add('The Doozies')
@@ -122,18 +122,18 @@ add('F Minus')
add('Family Tree')
add('Farcus')
add('Fat Cats', repl='-')
add('Flo and Friends')
#add('Flo and Friends')
add('The Flying McCoys')
add('Foolish Mortals', repl='-')
add('For Better or For Worse')
add('For Heavens Sake')
#add('For Heavens Sake')
add('Fort Knox')
add('FoxTrot')
add('FoxTrot Classics')
add('Frank and Ernest')
add('Frazz')
add('Fred Basset')
add('Free Range')
#add('Free Range')
add('Freshly Squeezed')
add('Frog Applause')
add('The Fusco Brothers')
@@ -154,9 +154,9 @@ add('Haiku Ewe')
add('Ham Shears')
add('Health Capsules')
add('Heart of the City')
add('Heathcliff')
#add('Heathcliff')
add('Heavenly Nostrils')
add('Herb and Jamaal')
#add('Herb and Jamaal')
add('Herman')
add('Home and Away')
add('HUBRIS!')
@@ -184,7 +184,7 @@ add('La Cucaracha')
add('Last Kiss')
add('The LeftyBosco Picture Show')
add('Legend of Bill')
add('Liberty Meadows')
#add('Liberty Meadows')
add('Lil Abner')
add('Lio')
add('Little Dog Lost')
@@ -201,7 +201,7 @@ add('Maintaining')
add('Marias Day')
add('Marmaduke')
add('McArroni')
add('The Meaning of Lila')
#add('The Meaning of Lila')
add('Medium Large')
add('Meg Classics')
add('The Middletons')
@@ -209,7 +209,7 @@ add('Mike du Jour')
add('Minimum Security')
add('Moderately Confused')
add('Molly and the Bear')
add('Momma')
#add('Momma')
add('Monty')
add('Motley Classics')
add('Mr. Gigi and the Squid')
@@ -217,7 +217,7 @@ add('Mutt and Jeff')
add('My Cage')
add('MythTickle')
add('Nancy')
add('Nest Heads')
#add('Nest Heads')
add('NEUROTICA')
add('New Adventures of Queen Victoria')
add('Non Sequitur')
@@ -225,10 +225,10 @@ add('The Norm Classics')
add('Nothing is Not Something')
add('Off the Mark')
add('Ollie and Quentin')
add('On A Claire Day')
add('One Big Happy')
#add('On A Claire Day')
#add('One Big Happy')
add('Ordinary Bill')
add('The Other Coast')
#add('The Other Coast')
add('Out of the Gene Pool Re-Runs')
add('Over the Hedge')
add('Overboard')
@@ -254,10 +254,10 @@ add('Reply All')
add('Rip Haywire')
add('Ripleys Believe It or Not')
add('Rose is Rose')
add('Rubes')
#add('Rubes')
add('Rudy Park')
add('Savage Chickens')
add('Scary Gary')
#add('Scary Gary')
add('Shirley and Son Classics')
add('Shoe')
add('Shoecabbage')
@@ -266,11 +266,11 @@ add('Skin Horse')
add('Skippy')
add('Slowpoke')
add('Soup to Nutz')
add('Speed Bump')
#add('Speed Bump')
add('Spot the Frog')
add('Starslip')
add('Stone Soup')
add('Strange Brew')
#add('Strange Brew')
add('The Sunshine Club')
add('Sylvia')
add('Tank McNamara')
@@ -280,7 +280,7 @@ add('Tales of TerraTopia')
add('That is Priceless')
add('Thats Life')
add('Thatababy')
add('Thin Lines')
#add('Thin Lines')
add('Tiny Sepuku')
add('TOBY')
add('Todays Dogg')
@@ -293,12 +293,12 @@ add('Unstrange Phenomena')
add('U.S. Acres')
add('Viivi and Wagner')
add('Watch Your Head')
add('Wee Pals')
add('Wizard of Id')
#add('Wee Pals')
#add('Wizard of Id')
add('Working Daze')
add('Working It Out')
#add('Working It Out')
add('W.T. Duck')
add('Zack Hill')
#add('Zack Hill')
add('Ziggy')
# http://www.gocomics.com/explore/editorial_list

View file

@@ -18,9 +18,9 @@ def add(name, urls):
name='KeenSpot/' + name,
latestUrl=latestUrl,
stripUrl=baseUrl + 'd/%s.html',
imageSearch = compile(tagre("img", "src", r'([^"]*comics/[^"]+)')),
prevSearch = compile(tagre("a", "href", r'"([^"]*d/\d{8}\.html)') +
'(?:<img[^>]+?(?:name="previous_day"|alt="Previous"|src="[^"]*back[^"]*")|Previous comic)'),
imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)')),
prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
'(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')'),
help = 'Index format: yyyymmdd',
)

View file

@@ -11,6 +11,7 @@ from ..scraper import _BasicScraper
class NineteenNinetySeven(_BasicScraper):
name = '1997'
latestUrl = 'http://www.1977thecomic.com/'
stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.1977thecomic\.com/comics-1977/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"]+)')+"Previous")
help = 'Index format: yyyy/mm/dd/strip-name'

View file

@@ -61,7 +61,7 @@ class Sheldon(_BasicScraper):
latestUrl = 'http://www.sheldoncomics.com/'
stripUrl = latestUrl + 'archive/%s.html'
imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'/archive/\d+\.html)', after="sidenav-prev"))
prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="sidenav-prev"))
help = 'Index format: yymmdd'

View file

@@ -1,280 +1,54 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from re import compile, sub
"""
The Universal comics only have some samples, but those samples are always the newest ones.
"""
import datetime
from re import compile, escape
from ..scraper import make_scraper
from ..util import fetchUrl, tagre
from ..util import tagre, asciify, getPageContent
def add(name, shortName):
homepage = 'http://content.uclick.com/a2z.html'
baseUrl = 'http://www.uclick.com/client/zzz/%s/'
latestUrl = baseUrl % shortName
classname = 'UClick_%s' % name
def parse_strdate(strdate):
"""Parse date string. XXX this is locale dependant but it should not be."""
return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")
def add(name, category):
shortname = name.replace(' ', '').lower()
latestUrl = 'http://www.universaluclick.com/comics/%s/%s' % (category, shortname)
classname = 'UClick_%s' % asciify(name)
@classmethod
def fetchSubmodules(cls):
exclusions = ('index',)
# XXX refactor this mess
submoduleSearch = compile(tagre("a", "href", r'(http://content\.uclick\.com/content/\w+\.html)'))
partsMatch = compile(tagre("a", "href", r'http://content\.uclick\.com/content/(\w+?)\.html'))
matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]
possibles = [partsMatch.match(match).groups() for match in matches]
def normalizeName(name):
name = sub(r'&(.)acute;', r'\1', name).title()
return ''.join([c for c in name if c.isalnum()])
def fetchSubmodule(module):
try:
return fetchUrl(cls.baseUrl % module, cls.imageSearch)
except Exception:
# XXX log error
return False
return [normalizeName(name) for part, name in possibles if part not in exclusions and fetchSubmodule(part)]
def namer(cls, imageUrl, pageUrl):
"""Parse publish date from page content which looks like:
<img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
<h4>published: Sunday, November 11, 2012</h4>
"""
data = getPageContent(pageUrl)[0]
ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
mo = ro.search(data)
if mo:
strdate = mo.group(1)
return parse_strdate(strdate).strftime("%Y%m%d")
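
A round trip of the date handling above (assuming an English locale, as
the XXX note warns):

    >>> parse_strdate('Sunday, November 11, 2012')
    datetime.datetime(2012, 11, 11, 0, 0)
    >>> parse_strdate('Sunday, November 11, 2012').strftime('%Y%m%d')
    '20121111'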
globals()[classname] = make_scraper(classname,
name='UClick/' + name,
latestUrl = latestUrl,
stripUrl = latestUrl + '%s/',
imageSearch = compile(tagre("img", "src", r'(http://synd\.imgsrv\.uclick\.com/comics/\w+/\d{4}/[^"]+\.gif)')),
prevSearch = compile(tagre("a", "href", r'(/client/zzz/\w+/\d{4}/\d{2}/\d{2}/)') + 'Previous date'),
help = 'Index format: yyyy/mm/dd',
imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published'),
multipleImagesPerStrip = True,
prevSearch = None,
help = 'Index format: none',
namer = namer,
)
# List is from http://www.universaluclick.com/comics/list
comics = {
'5thWave': 'fw',
'9To5': 'tmntf',
'AdamHome': 'ad',
'Agnes': 'cragn',
'AlcarazLalo': 'la',
'AlcarazLaloSpanish': 'spla',
'AndersonNick': 'wpnan',
'AndyCapp': 'crcap',
'AnimalCrackers': 'tmani',
'Annie': 'tmann',
'AsayChuck': 'crcas',
'AskShagg': 'crask',
'AuthTony': 'ta',
'BadReporter': 'bad',
'Baldo': 'ba',
'BaldoSpanish': 'be',
'BallardStreet': 'crbal',
'BarkEaterLake': 'bark',
'BarstowDonna': 'dba',
'BC': 'crbc',
'BCSpanish': 'crbcs',
'BeattieBruce': 'crbbe',
'BennetClay': 'wpcbe',
'BensonLisa': 'wplbe',
'BensonSteve': 'crsbe',
'BigTop': 'bt',
'Biographic': 'biov',
'Bleeker': 'blk',
'BobTheSquirrel': 'bob',
'BoilingPoint': 'boil',
'BokChip': 'crcbo',
'BoNanas': 'bon',
'Boomerangs': 'boom',
'BoondocksThe': 'bo',
'BottomLiners': 'tmbot',
'BoundAndGagged': 'tmbou',
'Brainwaves': 'bwv',
'BreenSteve': 'crsbr',
'BrendaStarr': 'tmbre',
'BrewsterRockit': 'tmrkt',
'BrittChris': 'crcbr',
'BroomHilda': 'tmbro',
'Candorville': 'cand',
'CarlsonStuart': 'sc',
'CatalinoKen': 'crkca',
'Cathy': 'ca',
'CathySpanish': 'spca',
'CEstLaVie': 'clv',
'CityThe': 'derf',
'ClearBlueWater': 'cbw',
'Cleats': 'cle',
'CloseToHome': 'cl',
'CombsPaul': 'tmcmb',
'CompuToon': 'tmcom',
'Condorito': 'cond',
'ConradPaul': 'tmpco',
'Cornered': 'co',
'CulDeSac': 'cds',
'DanzigerJeff': 'jd',
'DaviesMatt': 'tmmda',
'DeepCover': 'deep',
'DeeringJohn': 'crjde',
'DickTracy': 'tmdic',
'DinetteSetThe': 'crdin',
'DogEatDoug': 'crdog',
'DonWright': 'tmdow',
'Doodles': 'tmdoo',
'Doonesbury': 'db',
'DuplexThe': 'dp',
'Eek': 'eek',
'ElderberriesThe': 'eld',
'FacesInTheNews': 'kw',
'FlightDeck': 'crfd',
'FloAndFriends': 'crflo',
'FlyingMccoysThe': 'fmc',
'ForBetterOrForWorse': 'fb',
'ForHeavenSSake': 'crfhs',
'FoxtrotClassics': 'ftcl',
'Foxtrot': 'ft',
'FoxtrotSpanish': 'spft',
'FrankAndErnest': 'fa',
'FredBassetSpanish': 'spfba',
'FredBasset': 'tmfba',
'FrogApplause': 'frog',
'FuscoBrothersThe': 'fu',
'Garfield': 'ga',
'GarfieldSpanish': 'gh',
'GasolineAlley': 'tmgas',
'GaturroSpanish': 'spgat',
'GilThorp': 'tmgil',
'GingerMeggs': 'gin',
'GingerMeggsSpanish': 'spgin',
'GirlsAndSports': 'crgis',
'GorrellBob': 'crbgo',
'GoTeamBob': 'gtb',
'HammondBruce': 'hb',
'HandelsmanWalt': 'tmwha',
'HeartOfTheCity': 'hc',
'Heathcliff': 'crhea',
'HeathcliffSpanish': 'crhes',
'HerbAndJamaal': 'crher',
'HigginsJack': 'jh',
'HomeAndAway': 'wphaa',
'HorseyDavid': 'tmdho',
'Housebroken': 'tmhou',
'HubertAndAbby': 'haa',
'IdiotBox': 'ibox',
'ImagineThis': 'imt',
'InkPen': 'ink',
'InTheBleachers': 'bl',
'ItsAllAboutYou': 'wpiay',
'JamesBondSpanish': 'spjb',
'JonesClay': 'crcjo',
'KallaugherKevin': 'cwkal',
'KChroniclesThe': 'kk',
'KelleySteve': 'crske',
'Kudzu': 'tmkud',
'LaCucaracha': 'lc',
'LegendOfBill': 'lob',
'LibertyMeadows': 'crlib',
'Lio': 'lio',
'LittleDogLost': 'wpldl',
'LocherDick': 'tmdlo',
'LooseParts': 'tmloo',
'LostSheep': 'lost',
'LoweChan': 'tmclo',
'LuckovichMike': 'crmlu',
'LuckyCow': 'luc',
'MarkstienGary': 'crgma',
'MarletteDoug': 'tmdma',
'MccoyGlenn': 'gm',
'MeaningOfLilaThe': 'crlil',
'MeehanStreak': 'tmmee',
'MiddletonsThe': 'tmmid',
'MinimumSecurity': 'ms',
'ModestyBlaiseSpanish': 'spmb',
'Momma': 'crmom',
'MorinJim': 'cwjmo',
'MuttJeffSpanish': 'spmut',
'MythTickle': 'myth',
'NAoQV': 'naqv',
'NaturalSelection': 'crns',
'NestHeads': 'cpnst',
'Neurotica': 'neu',
'NonSequitur': 'nq',
'OhmanJack': 'tmjoh',
'OliphantPat': 'po',
'OnAClaireDay': 'crocd',
'OneBigHappy': 'crobh',
'OtherCoastThe': 'crtoc',
'OutOfTheGenePool': 'wpgen',
'Overboard': 'ob',
'OverboardSpanish': 'spob',
'PepeSpanish': 'sppep',
'PettJoel': 'jp',
'Pibgorn': 'pib',
'Pickles': 'wppic',
'Pluggers': 'tmplu',
'PoochCafe': 'poc',
'PoochCafeSpanish': 'sppoc',
'PopCulture': 'pop',
'PowellDwane': 'crdpo',
'Preteena': 'pr',
'PricklyCity': 'prc',
'QuigmansThe': 'tmqui',
'RallComic': 'tr',
'RamirezMicheal': 'crmrm',
'RamseyMarshall': 'crmra',
'RealLifeAdventures': 'rl',
'RedAndRover': 'wpred',
'RedMeat': 'red',
'ReynoldsUnwrapped': 'rw',
'RonaldinhoGaucho': 'ron',
'RonaldinhoGauchoSpanish': 'spron',
'Rubes': 'crrub',
'SackSteve': 'tmssa',
'SargentBen': 'bs',
'SargentBenSpanish': 'spbs',
'SendHelp': 'send',
'ShenemanDrew': 'tmdsh',
'SherffiusDrew': 'crjsh',
'Shoecabbage': 'shcab',
'Shoe': 'tmsho',
'SigmundSpanish': 'spsig',
'Slowpoke': 'slow',
'SmallWorld': 'small',
'SpaceIsThePlace': 'sitp',
'SpeedBump': 'crspe',
'StanisScott': 'crsst',
'StateOfTheUnion': 'crsou',
'StayskalWayne': 'tmwst',
'StoneSoup': 'ss',
'StrangeBrew': 'crstr',
'SummersDana': 'tmdsu',
'SuttonImpact': 'stn',
'Sylvia': 'tmsyl',
'SzepPaul': 'crpsz',
'TankMcnamara': 'tm',
'TeenageMutantNinjaTurtles': 'tmnt',
'TelnaesAnn': 'tmate',
'TheArgyleSweater': 'tas',
'ThePinkPanther': 'tmpnk',
'TheWizardOfId': 'crwiz',
'TheWizardOfIdSpanish': 'crwis',
'ThInk': 'think',
'ThompsonMike': 'crmth',
'ThroughThickAndThin': 'cpthk',
'TinySepuku': 'tiny',
'Toby': 'toby',
'TolesTom': 'tt',
'TomTheDancingBug': 'td',
'TooMuchCoffeeMan': 'tmcm',
'Trevor': 'trev',
'TutelandiaSpanish': 'sptut',
'VarvelGary': 'crgva',
'WassermanDan': 'tmdwa',
'WatchYourHead': 'wpwyh',
'Waylay': 'min',
'WeePals': 'crwee',
'WinnieThePooh': 'crwin',
'WitOfTheWorld': 'cwwit',
'WorkingItOut': 'crwio',
'WriteDon': 'tmdow',
'YennySpanish': 'spyen',
'Yenny': 'yen',
'ZackHill': 'crzhi',
'ZiggySpanish': 'spzi',
'Ziggy': 'zi',
'9 Chickweed Lane': 'strip',
}
for name, shortname in comics.items():
add(name, shortname)
for name, category in comics.items():
add(name, category)

View file

@@ -2,6 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from re import compile
from ..scraper import _BasicScraper
from ..util import tagre

View file

@@ -22,6 +22,9 @@ class _BasicScraper(object):
@cvar prevSearch: A compiled regex that will locate the URL for the
previous strip when applied to a strip page.
'''
# if more than one image per URL is expected
multipleImagesPerStrip = False
# usually the index format help
help = 'Sorry, no help for this comic yet.'
def __init__(self, indexes=None):
@@ -44,7 +47,9 @@ class _BasicScraper(object):
def getStrip(self, url):
"""Get comic strip for given URL."""
imageUrls = fetchUrls(url, self.imageSearch)
imageUrls = fetchUrls(url, self.imageSearch)[0]
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
raise ValueError("found %d images with %s" % (len(imageUrls), self.imageSearch.pattern))
return self.getComicStrip(url, imageUrls)
def getComicStrip(self, url, imageUrls):
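
For illustration, a scraper that legitimately yields several images per
page opts out of the new single-image check like this (hypothetical
class; the uclick module above does the same via make_scraper):

    class ExampleGallery(_BasicScraper):
        name = 'Example/Gallery'
        multipleImagesPerStrip = True  # more than one imageSearch match is fine
        imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))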
@@ -140,11 +145,13 @@ def get_scrapers():
"""
global _scrapers
if _scrapers is None:
out.write("Loading comic modules...")
modules = loader.get_modules()
plugins = loader.get_plugins(modules, _BasicScraper)
_scrapers = list(plugins)
_scrapers.sort(key=lambda s: s.get_name())
check_scrapers()
out.write("... %d modules loaded." % len(_scrapers))
return _scrapers

View file

@@ -4,6 +4,7 @@
from __future__ import division, print_function
import urllib2, urlparse
import requests
import sys
import os
import cgi
@@ -42,10 +43,6 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
@return: the generated regular expression suitable for re.compile()
@rtype: string
"""
if before:
before += "[^>]*"
if after:
after += "[^>]*"
attrs = dict(
tag=case_insensitive_re(tag),
attribute=case_insensitive_re(attribute),
@@ -54,7 +51,7 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
before=before,
after=after,
)
return r'<\s*%(tag)s\s+(?:[^>]*%(before)s\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s>' % attrs
return r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
def case_insensitive_re(name):
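
A self-contained sketch of the widened pattern (simplified: it leaves
out the case_insensitive_re() wrapping of the tag and attribute names):

    import re

    def tagre_sketch(tag, attribute, value, quote='"', before="", after=""):
        attrs = dict(tag=tag, attribute=attribute, value=value,
                     quote=quote, before=before, after=after)
        return (r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?'
                r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s'
                r'[^>]*%(after)s[^>]*>' % attrs)

    ro = re.compile(tagre_sketch("img", "src", r'([^"]+)', before="page-image"))
    mo = ro.search('<img class="page-image" alt="x" src="http://example.com/a.png">')
    print(mo.group(1))  # http://example.com/a.png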
@@ -74,7 +71,7 @@ baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
def getPageContent(url):
# read page data
page = urlopen(url)
data = page.read(MAX_FILESIZE)
data = page.text
# determine base URL
baseUrl = None
match = baseSearch.search(data)
@@ -105,7 +102,7 @@ def fetchUrls(url, imageSearch, prevSearch=None):
imageUrl = match.group(1)
if not imageUrl:
raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
out.write('matched image URL %r' % imageUrl, 2)
out.write('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern), 2)
imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
if not imageUrls:
out.write("warning: no images found at %s with pattern %s" % (url, imageSearch.pattern))
@@ -178,21 +175,17 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
out.write('Open URL %s' % url, 2)
assert retries >= 0, 'invalid retry value %r' % retries
assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
req = urllib2.Request(url)
headers = {'User-Agent': UserAgent}
config = {"max_retries": retries}
if referrer:
req.add_header('Referer', referrer)
req.add_header('User-Agent', UserAgent)
tries = 0
while True:
headers['Referer'] = referrer
try:
return urllib2.urlopen(req)
except IOError as err:
req = requests.get(url, headers=headers, config=config)
req.raise_for_status()
return req
except requests.exceptions.RequestException as err:
msg = 'URL retrieval of %s failed: %s' % (url, err)
out.write(msg)
out.write('waiting %d seconds and retrying (%d)' % (retry_wait_seconds, tries), 2)
time.sleep(retry_wait_seconds)
tries += 1
if tries >= retries:
raise IOError(msg)
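
The replacement code path boils down to the following (a sketch against
the pre-1.0 requests API, where get() still accepted a config dict;
requests 1.0 later moved max_retries to transport adapters):

    import requests

    req = requests.get('http://www.example.com/',  # hypothetical URL
                       headers={'User-Agent': 'Dosage'},
                       config={"max_retries": 3})
    req.raise_for_status()   # turn HTTP error codes into exceptions
    page_text = req.text     # decoded text, used by getPageContent()
    raw_bytes = req.content  # raw bytes, written out when saving a comic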
@@ -259,11 +252,9 @@ def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
print("""********** Oops, I did it again. *************
You have found an internal error in %(app)s. Please write a bug report
at %(url)s and include the following information:
- your commandline arguments and any configuration file in ~/.dosage/
- the system information below
at %(url)s and include at least the information below:
Not disclosing some of the information above due to privacy reasons is ok.
Not disclosing some of the information below due to privacy reasons is ok.
I will try to help you nonetheless, but you have to give me something
I can work with ;) .
""" % dict(app=AppName, url=SupportUrl), file=out)
@@ -308,6 +299,7 @@ def print_app_info(out=sys.stderr):
{"version": sys.version, "platform": sys.platform}, file=out)
stime = strtime(time.time())
print("Local time:", stime, file=out)
print("sys.argv", sys.argv, file=out)
def strtime(t):

requirements.txt Normal file
View file

@@ -0,0 +1,2 @@
requests

View file

@@ -4,6 +4,7 @@
import tempfile
import shutil
import re
import os
from itertools import islice
from unittest import TestCase
from dosagelib import scraper
@@ -16,6 +17,16 @@ class _ComicTester(TestCase):
def setUp(self):
self.name = self.scraperclass.get_name()
self.url = self.scraperclass.starter()
# create a temporary directory for images
self.tmpdir = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.tmpdir)
def get_saved_images(self):
"""Get saved images."""
dirs = tuple(self.name.split('/'))
return os.listdir(os.path.join(self.tmpdir, *dirs))
def test_comic(self):
# Test a scraper. It must be able to traverse backward for
@@ -23,7 +34,8 @@ class _ComicTester(TestCase):
# on at least 4 pages.
scraperobj = self.scraperclass()
num = empty = 0
for strip in islice(scraperobj.getAllStrips(), 0, 5):
max_strips = 5
for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
images = 0
for image in strip.getImages():
images += 1
@@ -35,6 +47,15 @@ class _ComicTester(TestCase):
num += 1
if self.scraperclass.prevSearch:
self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num)
# Check that exactly max_strips images were saved, or at least max_strips when a strip may contain multiple images.
saved_images = self.get_saved_images()
num_images = len(saved_images)
if self.scraperclass.multipleImagesPerStrip:
self.check(num_images >= max_strips,
'saved %d %s instead of at least %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
else:
self.check(num_images == max_strips,
'saved %d %s instead of %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty)
def check_stripurl(self, strip):
@@ -50,28 +71,28 @@
self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch))
def save(self, image):
# create a temporary directory
tmpdir = tempfile.mkdtemp()
try:
image.save(tmpdir)
image.save(self.tmpdir)
except Exception as msg:
self.check(False, 'could not save %s to %s: %s' % (image.url, tmpdir, msg))
finally:
shutil.rmtree(tmpdir)
self.check(False, 'could not save %s to %s: %s' % (image.url, self.tmpdir, msg))
def check(self, condition, msg):
self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg))
def make_comic_tester(name, **kwargs):
"""Create and return a _ComicTester class with given name and attributes."""
return type(name, (_ComicTester,), kwargs)
def generate_comic_testers():
"""For each comic scraper, create a test class."""
g = globals()
# Limit number of scraper tests for now
max_scrapers = 100
max_scrapers = 10000
for scraperclass in islice(scraper.get_scrapers(), 0, max_scrapers):
name = 'Test'+scraperclass.__name__
globals()[name] = type(name,
(_ComicTester,),
dict(scraperclass=scraperclass)
)
g[name] = make_comic_tester(name, scraperclass=scraperclass)
generate_comic_testers()
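
The net effect of the generator, spelled out for one hypothetical
scraper class:

    # TestXkcd = make_comic_tester('TestXkcd', scraperclass=Xkcd)
    # i.e. one TestCase subclass per comic scraper, found by test discovery.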