Fix comics, improve tests, use python-requests.

2012-11-26 18:44:31 +01:00 · 2012-11-26 18:44:31 +01:00 · 0556ffd30a
commit 0556ffd30a
parent d4eee7719d
16 changed files with 191 additions and 403 deletions
--- a/doc/README.txt
+++ b/doc/README.txt
@ -40,10 +40,11 @@ manual page.
 Dependencies
 -------------
-Dosage requires Python version 2.7 or higher, which can be downloaded
+Python version 2.7 or higher, which can be downloaded
-from http://www.python.org.
+from http://www.python.org/
-No external Python modules are required - only the Python Standard Library
+
-that gets installed with Python.
+Also the python-requests module must be installed, which can be downloaded
 from http://docs.python-requests.org/en/latest/
 Installation
 -------------
@ -59,7 +60,7 @@ or if you do not have root permissions:
 Technical Description
 ----------------------
-Dosage is written entirely in Python and relies on regular expressions to
+Dosage is written in Python and relies on regular expressions to
 do most of the grunt work.
 For each webcomic Dosage has a plugin module, found in the "plugins"
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@ -4,6 +4,7 @@ Features:
 - cmdline: Added proper return codes for error conditions.
 - comics: Added more robust regular expressions for HTML tags.
  They now match case-insensitively and ignore whitespace.
 - comics: Use the python-requests module for HTTP requests.
 Changes:
 - installation: Added support for dynamic configuration values.
--- a/dosagelib/comic.py
+++ b/dosagelib/comic.py
@ -2,7 +2,6 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
 import urllib2
 import os
 import locale
 import rfc822
@ -55,18 +54,24 @@ class ComicImage(object):
        """Connect to host and get meta information."""
        try:
            self.urlobj = urlopen(self.url, referrer=self.referrer)
-        except urllib2.HTTPError as he:
+        except IOError as he:
            raise FetchComicError('Unable to retrieve URL.', self.url, he.code)
-        if self.urlobj.info().getmaintype() != 'image' and \
+        content_type = self.urlobj.headers.get('content-type')
-           self.urlobj.info().gettype() not in ('application/octet-stream', 'application/x-shockwave-flash'):
+        content_type = content_type.split(';', 1)[0]
        if '/' in content_type:
            maintype, subtype = content_type.split('/', 1)
        else:
            maintype = content_type
            subtype = None
        if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
            raise FetchComicError('No suitable image found to retrieve.', self.url)
        # Always use mime type for file extension if it is sane.
-        if self.urlobj.info().getmaintype() == 'image':
+        if maintype == 'image':
-            self.ext = '.' + self.urlobj.info().getsubtype().replace('jpeg', 'jpg')
+            self.ext = '.' + subtype.replace('jpeg', 'jpg')
-        self.contentLength = int(self.urlobj.info().get('content-length', 0))
+        self.contentLength = int(self.urlobj.headers.get('content-length', 0))
-        self.lastModified = self.urlobj.info().get('last-modified')
+        self.lastModified = self.urlobj.headers.get('last-modified')
        out.write('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength), 2)
    def touch(self, filename):
@ -88,7 +93,6 @@ class ComicImage(object):
        fn = os.path.join(comicDir, filename)
        if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
            self.urlobj.close()
            self.touch(fn)
            out.write('Skipping existing file "%s".' % (fn,), 1)
            return fn, False
@ -97,7 +101,7 @@ class ComicImage(object):
            out.write('Writing comic to file %s...' % (fn,), 3)
            with open(fn, 'wb') as comicOut:
                startTime = time.time()
-                comicOut.write(self.urlobj.read())
+                comicOut.write(self.urlobj.content)
                endTime = time.time()
            self.touch(fn)
        except:
@ -114,7 +118,5 @@ class ComicImage(object):
            attrs = dict(fn=fn, bytes=bytes, speed=speed)
            out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
            getHandler().comicDownloaded(self.name, fn)
        finally:
            self.urlobj.close()
        return fn, True
--- a/dosagelib/plugins/d.py
+++ b/dosagelib/plugins/d.py
@ -13,7 +13,7 @@ from ..util import tagre, getQueryParams
 class DMFA(_BasicScraper):
    latestUrl = 'http://www.missmab.com/'
    stripUrl = latestUrl + 'Comics/Vol_%s.php'
-    imageSearch = compile(tagre("img", "src", r'(Comics/|Vol)[^"]+)'))
+    imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
    prevSearch = compile(tagre("a", "href", r'([^"])+')+
      tagre("img", "src", r'(?:../)?Images/comicprev.gif'))
    help = 'Index format: nnn (normally, some specials)'
--- a/dosagelib/plugins/drunkduck.py
+++ b/dosagelib/plugins/drunkduck.py
@ -4,22 +4,29 @@
 from re import compile
 from ..scraper import make_scraper
-from ..helpers import bounceStarter, queryNamer
+from ..helpers import bounceStarter
 from ..util import tagre
 def add(name):
    classname = 'DrunkDuck_%s' % name
    url = 'http://www.drunkduck.com/%s/' % name
-    linkSearch = tagre("a", "href", r"(/[^/]*/index\.php\?p=\d+)", quote="'", after="The %s page")
+    linkSearch = tagre("a", "href", r"(/%s/\d+/)" % name)
    @classmethod
    def namer(cls, imageUrl, pageUrl):
        index = int(pageUrl.rstrip('/').split('/')[-1])
        ext = imageUrl.rsplit('.')[-1]
        return '%d.%s' % (index, ext)
    globals()[classname] = make_scraper(classname,
        name = 'DrunkDuck/' + name,
-        starter = bounceStarter(url, compile(linkSearch % 'next')),
+        starter = bounceStarter(url, compile(linkSearch + tagre("img", "class", "arrow_next"))),
-        stripUrl = url + 'index.php?p=%s' % name,
+        stripUrl = url + '%s/',
-        imageSearch = compile(tagre("img", "src", r"(http://[a-z0-9]*\.drunkduck\.com/[^/]*/pages/[^'/]+)", quote="'")),
+        imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image")),
-        prevSearch= compile(linkSearch % 'previous'),
+        prevSearch= compile(linkSearch + tagre("img", "class", "arrow_prev")),
        help = 'Index format: n (unpadded)',
-        namer = queryNamer('p', usePageUrl=True),
+        namer = namer,
    )
 comics = (
--- a/dosagelib/plugins/fallenangel.py
+++ b/dosagelib/plugins/fallenangel.py
@ -1,47 +1,26 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
-from ..scraper import _BasicScraper
+
 from re import compile
 from ..scraper import make_scraper
 from ..util import asciify
-def fallenangel(name, shortname):
+def add(name, shortname):
-    pass # XXX
+    latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
-
+    classname = asciify(name)
-class _TheFallenAngel(_BasicScraper):
+    globals()[classname] = make_scraper(classname,
-    imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"')
+        latestUrl = latestUrl,
-    prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"')
+        stripUrl = latestUrl + '?date=%s',
-    help = 'Index format: yyyymmdd'
+        name='FallenAngel/' + name,
-
+        imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"'),
-    @property
+        prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"'),
-    def baseUrl(self):
+        help = 'Index format: yyyymmdd',
-        return 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % (self.shortName,)
+    )
    @property
    def stripUrl(self):
        return self.baseUrl + '?date=%s'
    def starter(self):
        return self.baseUrl
 class HighMaintenance(_TheFallenAngel):
    name = 'TheFallenAngel/HighMaintenance'
    shortName = 'hm'
 class FAWK(_TheFallenAngel):
    name = 'TheFallenAngel/FAWK'
    shortName = 'fawk'
 class MalloryChan(_TheFallenAngel):
    name = 'TheFallenAngel/MalloryChan'
    shortName = 'mallorychan'
 add('HighMaintenance', 'hm')
 add('FAWK', 'fawk')
 add('MalloryChan', 'mallorychan')
--- a/dosagelib/plugins/gocomics.py
+++ b/dosagelib/plugins/gocomics.py
@ -29,30 +29,30 @@ def add(name, repl=''):
 # http://www.gocomics.com/features
-# note that comics from creators.com are not repeated here
+# Duplicate comics from creators.com are commented out
 add('2 Cows and a Chicken')
 add('9 Chickweed Lane')
 add('9 to 5')
 add('The Academia Waltz')
 add('Adam at Home')
-add('Agnes')
+#add('Agnes')
 add('Alley Oop', repl='-')
 add('Andertoons')
-add('Andy Capp')
+#add('Andy Capp')
 add('Angry Little Girls', repl='-')
 add('Animal Crackers')
 add('Annie')
 add('The Argyle Sweater')
 add('Arlo and Janis')
-add('Ask Shagg')
+#add('Ask Shagg')
-add('BC')
+#add('BC')
 add('Back in the Day')
 add('Bad Reporter')
 add('Baldo')
-add('Ballard Street')
+#add('Ballard Street')
 add('Banana Triangle', repl='-')
 add('Barkeater Lake')
-add('The Barn')
+#add('The Barn')
 add('Barney and Clyde')
 add('Basic Instructions')
 add('Beardo')
@ -81,13 +81,13 @@ add('Brewster Rockit')
 add('Broom Hilda')
 add('The Buckets')
 add('Buni')
-add('Cafe con Leche')
+#add('Cafe con Leche')
 add('Calvin and Hobbes')
 add('Candorville')
 add('Cathy')
 add('Cest la Vie')
 add('Cheap Thrills Cuisine', repl='-')
-add('Chuckle Bros')
+#add('Chuckle Bros')
 add('Citizen Dog')
 add('The City')
 add('Cleats')
@ -99,15 +99,15 @@ add('Cow and Boy')
 add('CowTown')
 add('Crumb')
 add('Cul de Sac')
-add('Daddys Home')
+#add('Daddys Home')
 add('Dark Side of the Horse')
 add('Deep Cover')
-add('Diamond Lil')
+#add('Diamond Lil')
 add('Dick Tracy')
-add('The Dinette Set')
+#add('The Dinette Set')
 add('Dixie Drive', repl='-')
-add('Dog Eat Doug')
+#add('Dog Eat Doug')
-add('Dogs of C Kennel')
+#add('Dogs of C Kennel')
 add('Domestic Abuse')
 add('Doonesbury')
 add('The Doozies')
@ -122,18 +122,18 @@ add('F Minus')
 add('Family Tree')
 add('Farcus')
 add('Fat Cats', repl='-')
-add('Flo and Friends')
+#add('Flo and Friends')
 add('The Flying McCoys')
 add('Foolish Mortals', repl='-')
 add('For Better or For Worse')
-add('For Heavens Sake')
+#add('For Heavens Sake')
 add('Fort Knox')
 add('FoxTrot')
 add('FoxTrot Classics')
 add('Frank and Ernest')
 add('Frazz')
 add('Fred Basset')
-add('Free Range')
+#add('Free Range')
 add('Freshly Squeezed')
 add('Frog Applause')
 add('The Fusco Brothers')
@ -154,9 +154,9 @@ add('Haiku Ewe')
 add('Ham Shears')
 add('Health Capsules')
 add('Heart of the City')
-add('Heathcliff')
+#add('Heathcliff')
 add('Heavenly Nostrils')
-add('Herb and Jamaal')
+#add('Herb and Jamaal')
 add('Herman')
 add('Home and Away')
 add('HUBRIS!')
@ -184,7 +184,7 @@ add('La Cucaracha')
 add('Last Kiss')
 add('The LeftyBosco Picture Show')
 add('Legend of Bill')
-add('Liberty Meadows')
+#add('Liberty Meadows')
 add('Lil Abner')
 add('Lio')
 add('Little Dog Lost')
@ -201,7 +201,7 @@ add('Maintaining')
 add('Marias Day')
 add('Marmaduke')
 add('McArroni')
-add('The Meaning of Lila')
+#add('The Meaning of Lila')
 add('Medium Large')
 add('Meg Classics')
 add('The Middletons')
@ -209,7 +209,7 @@ add('Mike du Jour')
 add('Minimum Security')
 add('Moderately Confused')
 add('Molly and the Bear')
-add('Momma')
+#add('Momma')
 add('Monty')
 add('Motley Classics')
 add('Mr. Gigi and the Squid')
@ -217,7 +217,7 @@ add('Mutt and Jeff')
 add('My Cage')
 add('MythTickle')
 add('Nancy')
-add('Nest Heads')
+#add('Nest Heads')
 add('NEUROTICA')
 add('New Adventures of Queen Victoria')
 add('Non Sequitur')
@ -225,10 +225,10 @@ add('The Norm Classics')
 add('Nothing is Not Something')
 add('Off the Mark')
 add('Ollie and Quentin')
-add('On A Claire Day')
+#add('On A Claire Day')
-add('One Big Happy')
+#add('One Big Happy')
 add('Ordinary Bill')
-add('The Other Coast')
+#add('The Other Coast')
 add('Out of the Gene Pool Re-Runs')
 add('Over the Hedge')
 add('Overboard')
@ -254,10 +254,10 @@ add('Reply All')
 add('Rip Haywire')
 add('Ripleys Believe It or Not')
 add('Rose is Rose')
-add('Rubes')
+#add('Rubes')
 add('Rudy Park')
 add('Savage Chickens')
-add('Scary Gary')
+#add('Scary Gary')
 add('Shirley and Son Classics')
 add('Shoe')
 add('Shoecabbage')
@ -266,11 +266,11 @@ add('Skin Horse')
 add('Skippy')
 add('Slowpoke')
 add('Soup to Nutz')
-add('Speed Bump')
+#add('Speed Bump')
 add('Spot the Frog')
 add('Starslip')
 add('Stone Soup')
-add('Strange Brew')
+#add('Strange Brew')
 add('The Sunshine Club')
 add('Sylvia')
 add('Tank McNamara')
@ -280,7 +280,7 @@ add('Tales of TerraTopia')
 add('That is Priceless')
 add('Thats Life')
 add('Thatababy')
-add('Thin Lines')
+#add('Thin Lines')
 add('Tiny Sepuku')
 add('TOBY')
 add('Todays Dogg')
@ -293,12 +293,12 @@ add('Unstrange Phenomena')
 add('U.S. Acres')
 add('Viivi and Wagner')
 add('Watch Your Head')
-add('Wee Pals')
+#add('Wee Pals')
-add('Wizard of Id')
+#add('Wizard of Id')
 add('Working Daze')
-add('Working It Out')
+#add('Working It Out')
 add('W.T. Duck')
-add('Zack Hill')
+#add('Zack Hill')
 add('Ziggy')
 # http://www.gocomics.com/explore/editorial_list
--- a/dosagelib/plugins/keenspot.py
+++ b/dosagelib/plugins/keenspot.py
@ -18,9 +18,9 @@ def add(name, urls):
        name='KeenSpot/' + name,
        latestUrl=latestUrl,
        stripUrl=baseUrl + 'd/%s.html',
-        imageSearch = compile(tagre("img", "src", r'([^"]*comics/[^"]+)')),
+        imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)')),
-        prevSearch = compile(tagre("a", "href", r'"([^"]*d/\d{8}\.html)') +
+        prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
-           '(?:<img[^>]+?(?:name="previous_day"|alt="Previous"|src="[^"]*back[^"]*")|Previous comic)'),
+           '(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')'),
        help = 'Index format: yyyymmdd',
    )
--- a/dosagelib/plugins/num.py
+++ b/dosagelib/plugins/num.py
@ -11,6 +11,7 @@ from ..scraper import _BasicScraper
 class NineteenNinetySeven(_BasicScraper):
    name = '1997'
    latestUrl = 'http://www.1977thecomic.com/'
    stripUrl = latestUrl + '%s'
    imageSearch = compile(tagre("img", "src", r'(http://www\.1977thecomic\.com/comics-1977/[^"]+)'))
    prevSearch = compile(tagre("a", "href", r'([^"]+)')+"Previous")
    help = 'Index format: yyyy/mm/dd/strip-name'
--- a/dosagelib/plugins/s.py
+++ b/dosagelib/plugins/s.py
@ -61,7 +61,7 @@ class Sheldon(_BasicScraper):
    latestUrl = 'http://www.sheldoncomics.com/'
    stripUrl = latestUrl + 'archive/%s.html'
    imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'/archive/\d+\.html)', after="sidenav-prev"))
+    prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="sidenav-prev"))
    help = 'Index format: yymmdd'
--- a/dosagelib/plugins/uc.py
+++ b/dosagelib/plugins/uc.py
@ -1,280 +1,54 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
-
+"""
-from re import compile, sub
+The Universal comics only have some samples, but those samples are always the newest ones.
 """
 import datetime
 from re import compile, escape
 from ..scraper import make_scraper
-from ..util import fetchUrl, tagre
+from ..util import tagre, asciify, getPageContent
-def add(name, shortName):
+def parse_strdate(strdate):
-    homepage = 'http://content.uclick.com/a2z.html'
+    """Parse date string. XXX this is locale dependent but it should not be."""
-    baseUrl = 'http://www.uclick.com/client/zzz/%s/'
+    return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")
-    latestUrl = baseUrl % shortName
+
-    classname = 'UClick_%s' % name
+
 def add(name, category):
    shortname = name.replace(' ', '').lower()
    latestUrl = 'http://www.universaluclick.com/comics/%s/%s' % (category, shortname)
    classname = 'UClick_%s' % asciify(name)
    @classmethod
-    def fetchSubmodules(cls):
+    def namer(cls, imageUrl, pageUrl):
-        exclusions = ('index',)
+        """Parse publish date from page content which looks like:
-        # XXX refactor this mess
+         <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
-        submoduleSearch = compile(tagre("a", "href", r'(http://content\.uclick\.com/content/\w+\.html)'))
+         <h4>published: Sunday, November 11, 2012</h4>
-        partsMatch = compile(tagre("a", "href", r'http://content\.uclick\.com/content/(\w+?)\.html'))
+        """
-        matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]
+        data = getPageContent(pageUrl)[0]
-        possibles = [partsMatch.match(match).groups() for match in matches]
+        ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
-
+        mo = ro.search(data)
-        def normalizeName(name):
+        if mo:
-            name = sub(r'&(.)acute;', r'\1', name).title()
+             strdate = mo.group(1)
-            return ''.join([c for c in name if c.isalnum()])
+             return parse_strdate(strdate).strftime("%Y%m%d")
        def fetchSubmodule(module):
            try:
                return fetchUrl(cls.baseUrl % module, cls.imageSearch)
            except Exception:
                # XXX log error
                return False
        return [normalizeName(name) for part, name in possibles if part not in exclusions and fetchSubmodule(part)]
    globals()[classname] = make_scraper(classname,
        name='UClick/' + name,
        latestUrl = latestUrl,
        stripUrl = latestUrl + '%s/',
-        imageSearch = compile(tagre("img", "src", r'(http://synd\.imgsrv\.uclick\.com/comics/\w+/\d{4}/[^"]+\.gif)')),
+        imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published'),
-        prevSearch = compile(tagre("a", "href", r'(/client/zzz/\w+/\d{4}/\d{2}/\d{2}/)') + 'Previous date'),
+        multipleImagesPerStrip = True,
-        help = 'Index format: yyyy/mm/dd',
+        prevSearch = None,
        help = 'Index format: none',
        namer = namer,
    )
 # List is from http://www.universaluclick.com/comics/list
 comics = {
-    '5thWave': 'fw',
+    '9 Chickweed Lane': 'strip',
    '9To5': 'tmntf',
    'AdamHome': 'ad',
    'Agnes': 'cragn',
    'AlcarazLalo': 'la',
    'AlcarazLaloSpanish': 'spla',
    'AndersonNick': 'wpnan',
    'AndyCapp': 'crcap',
    'AnimalCrackers': 'tmani',
    'Annie': 'tmann',
    'AsayChuck': 'crcas',
    'AskShagg': 'crask',
    'AuthTony': 'ta',
    'BadReporter': 'bad',
    'Baldo': 'ba',
    'BaldoSpanish': 'be',
    'BallardStreet': 'crbal',
    'BarkEaterLake': 'bark',
    'BarstowDonna': 'dba',
    'BC': 'crbc',
    'BCSpanish': 'crbcs',
    'BeattieBruce': 'crbbe',
    'BennetClay': 'wpcbe',
    'BensonLisa': 'wplbe',
    'BensonSteve': 'crsbe',
    'BigTop': 'bt',
    'Biographic': 'biov',
    'Bleeker': 'blk',
    'BobTheSquirrel': 'bob',
    'BoilingPoint': 'boil',
    'BokChip': 'crcbo',
    'BoNanas': 'bon',
    'Boomerangs': 'boom',
    'BoondocksThe': 'bo',
    'BottomLiners': 'tmbot',
    'BoundAndGagged': 'tmbou',
    'Brainwaves': 'bwv',
    'BreenSteve': 'crsbr',
    'BrendaStarr': 'tmbre',
    'BrewsterRockit': 'tmrkt',
    'BrittChris': 'crcbr',
    'BroomHilda': 'tmbro',
    'Candorville': 'cand',
    'CarlsonStuart': 'sc',
    'CatalinoKen': 'crkca',
    'Cathy': 'ca',
    'CathySpanish': 'spca',
    'CEstLaVie': 'clv',
    'CityThe': 'derf',
    'ClearBlueWater': 'cbw',
    'Cleats': 'cle',
    'CloseToHome': 'cl',
    'CombsPaul': 'tmcmb',
    'CompuToon': 'tmcom',
    'Condorito': 'cond',
    'ConradPaul': 'tmpco',
    'Cornered': 'co',
    'CulDeSac': 'cds',
    'DanzigerJeff': 'jd',
    'DaviesMatt': 'tmmda',
    'DeepCover': 'deep',
    'DeeringJohn': 'crjde',
    'DickTracy': 'tmdic',
    'DinetteSetThe': 'crdin',
    'DogEatDoug': 'crdog',
    'DonWright': 'tmdow',
    'Doodles': 'tmdoo',
    'Doonesbury': 'db',
    'DuplexThe': 'dp',
    'Eek': 'eek',
    'ElderberriesThe': 'eld',
    'FacesInTheNews': 'kw',
    'FlightDeck': 'crfd',
    'FloAndFriends': 'crflo',
    'FlyingMccoysThe': 'fmc',
    'ForBetterOrForWorse': 'fb',
    'ForHeavenSSake': 'crfhs',
    'FoxtrotClassics': 'ftcl',
    'Foxtrot': 'ft',
    'FoxtrotSpanish': 'spft',
    'FrankAndErnest': 'fa',
    'FredBassetSpanish': 'spfba',
    'FredBasset': 'tmfba',
    'FrogApplause': 'frog',
    'FuscoBrothersThe': 'fu',
    'Garfield': 'ga',
    'GarfieldSpanish': 'gh',
    'GasolineAlley': 'tmgas',
    'GaturroSpanish': 'spgat',
    'GilThorp': 'tmgil',
    'GingerMeggs': 'gin',
    'GingerMeggsSpanish': 'spgin',
    'GirlsAndSports': 'crgis',
    'GorrellBob': 'crbgo',
    'GoTeamBob': 'gtb',
    'HammondBruce': 'hb',
    'HandelsmanWalt': 'tmwha',
    'HeartOfTheCity': 'hc',
    'Heathcliff': 'crhea',
    'HeathcliffSpanish': 'crhes',
    'HerbAndJamaal': 'crher',
    'HigginsJack': 'jh',
    'HomeAndAway': 'wphaa',
    'HorseyDavid': 'tmdho',
    'Housebroken': 'tmhou',
    'HubertAndAbby': 'haa',
    'IdiotBox': 'ibox',
    'ImagineThis': 'imt',
    'InkPen': 'ink',
    'InTheBleachers': 'bl',
    'ItsAllAboutYou': 'wpiay',
    'JamesBondSpanish': 'spjb',
    'JonesClay': 'crcjo',
    'KallaugherKevin': 'cwkal',
    'KChroniclesThe': 'kk',
    'KelleySteve': 'crske',
    'Kudzu': 'tmkud',
    'LaCucaracha': 'lc',
    'LegendOfBill': 'lob',
    'LibertyMeadows': 'crlib',
    'Lio': 'lio',
    'LittleDogLost': 'wpldl',
    'LocherDick': 'tmdlo',
    'LooseParts': 'tmloo',
    'LostSheep': 'lost',
    'LoweChan': 'tmclo',
    'LuckovichMike': 'crmlu',
    'LuckyCow': 'luc',
    'MarkstienGary': 'crgma',
    'MarletteDoug': 'tmdma',
    'MccoyGlenn': 'gm',
    'MeaningOfLilaThe': 'crlil',
    'MeehanStreak': 'tmmee',
    'MiddletonsThe': 'tmmid',
    'MinimumSecurity': 'ms',
    'ModestyBlaiseSpanish': 'spmb',
    'Momma': 'crmom',
    'MorinJim': 'cwjmo',
    'MuttJeffSpanish': 'spmut',
    'MythTickle': 'myth',
    'NAoQV': 'naqv',
    'NaturalSelection': 'crns',
    'NestHeads': 'cpnst',
    'Neurotica': 'neu',
    'NonSequitur': 'nq',
    'OhmanJack': 'tmjoh',
    'OliphantPat': 'po',
    'OnAClaireDay': 'crocd',
    'OneBigHappy': 'crobh',
    'OtherCoastThe': 'crtoc',
    'OutOfTheGenePool': 'wpgen',
    'Overboard': 'ob',
    'OverboardSpanish': 'spob',
    'PepeSpanish': 'sppep',
    'PettJoel': 'jp',
    'Pibgorn': 'pib',
    'Pickles': 'wppic',
    'Pluggers': 'tmplu',
    'PoochCafe': 'poc',
    'PoochCafeSpanish': 'sppoc',
    'PopCulture': 'pop',
    'PowellDwane': 'crdpo',
    'Preteena': 'pr',
    'PricklyCity': 'prc',
    'QuigmansThe': 'tmqui',
    'RallComic': 'tr',
    'RamirezMicheal': 'crmrm',
    'RamseyMarshall': 'crmra',
    'RealLifeAdventures': 'rl',
    'RedAndRover': 'wpred',
    'RedMeat': 'red',
    'ReynoldsUnwrapped': 'rw',
    'RonaldinhoGaucho': 'ron',
    'RonaldinhoGauchoSpanish': 'spron',
    'Rubes': 'crrub',
    'SackSteve': 'tmssa',
    'SargentBen': 'bs',
    'SargentBenSpanish': 'spbs',
    'SendHelp': 'send',
    'ShenemanDrew': 'tmdsh',
    'SherffiusDrew': 'crjsh',
    'Shoecabbage': 'shcab',
    'Shoe': 'tmsho',
    'SigmundSpanish': 'spsig',
    'Slowpoke': 'slow',
    'SmallWorld': 'small',
    'SpaceIsThePlace': 'sitp',
    'SpeedBump': 'crspe',
    'StanisScott': 'crsst',
    'StateOfTheUnion': 'crsou',
    'StayskalWayne': 'tmwst',
    'StoneSoup': 'ss',
    'StrangeBrew': 'crstr',
    'SummersDana': 'tmdsu',
    'SuttonImpact': 'stn',
    'Sylvia': 'tmsyl',
    'SzepPaul': 'crpsz',
    'TankMcnamara': 'tm',
    'TeenageMutantNinjaTurtles': 'tmnt',
    'TelnaesAnn': 'tmate',
    'TheArgyleSweater': 'tas',
    'ThePinkPanther': 'tmpnk',
    'TheWizardOfId': 'crwiz',
    'TheWizardOfIdSpanish': 'crwis',
    'ThInk': 'think',
    'ThompsonMike': 'crmth',
    'ThroughThickAndThin': 'cpthk',
    'TinySepuku': 'tiny',
    'Toby': 'toby',
    'TolesTom': 'tt',
    'TomTheDancingBug': 'td',
    'TooMuchCoffeeMan': 'tmcm',
    'Trevor': 'trev',
    'TutelandiaSpanish': 'sptut',
    'VarvelGary': 'crgva',
    'WassermanDan': 'tmdwa',
    'WatchYourHead': 'wpwyh',
    'Waylay': 'min',
    'WeePals': 'crwee',
    'WinnieThePooh': 'crwin',
    'WitOfTheWorld': 'cwwit',
    'WorkingItOut': 'crwio',
    'WriteDon': 'tmdow',
    'YennySpanish': 'spyen',
    'Yenny': 'yen',
    'ZackHill': 'crzhi',
    'ZiggySpanish': 'spzi',
    'Ziggy': 'zi',
 }
-for name, shortname in comics.items():
+for name, category in comics.items():
-    add(name, shortname)
+    add(name, category)
--- a/dosagelib/plugins/y.py
+++ b/dosagelib/plugins/y.py
@ -2,6 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
 from re import compile
 from ..scraper import _BasicScraper
 from ..util import tagre
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@ -22,6 +22,9 @@ class _BasicScraper(object):
    @cvar prevSearch: A compiled regex that will locate the URL for the
        previous strip when applied to a strip page.
    '''
    # if more than one image per URL is expected
    multipleImagesPerStrip = False
    # usually the index format help
    help = 'Sorry, no help for this comic yet.'
    def __init__(self, indexes=None):
@ -44,7 +47,9 @@ class _BasicScraper(object):
    def getStrip(self, url):
        """Get comic strip for given URL."""
-        imageUrls = fetchUrls(url, self.imageSearch)
+        imageUrls = fetchUrls(url, self.imageSearch)[0]
        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
            raise ValueError("found %d images with %s" % (len(imageUrls), self.imageSearch.pattern))
        return self.getComicStrip(url, imageUrls)
    def getComicStrip(self, url, imageUrls):
@ -140,11 +145,13 @@ def get_scrapers():
    """
    global _scrapers
    if _scrapers is None:
        out.write("Loading comic modules...")
        modules = loader.get_modules()
        plugins = loader.get_plugins(modules, _BasicScraper)
        _scrapers = list(plugins)
        _scrapers.sort(key=lambda s: s.get_name())
        check_scrapers()
        out.write("... %d modules loaded." % len(_scrapers))
    return _scrapers
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@ -4,6 +4,7 @@
 from __future__ import division, print_function
 import urllib2, urlparse
 import requests
 import sys
 import os
 import cgi
@ -42,10 +43,6 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
    @return: the generated regular expression suitable for re.compile()
    @rtype: string
    """
    if before:
        before += "[^>]*"
    if after:
        after += "[^>]*"
    attrs = dict(
        tag=case_insensitive_re(tag),
        attribute=case_insensitive_re(attribute),
@ -54,7 +51,7 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
        before=before,
        after=after,
    )
-    return r'<\s*%(tag)s\s+(?:[^>]*%(before)s\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s>' % attrs
+    return r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
 def case_insensitive_re(name):
@ -74,7 +71,7 @@ baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
 def getPageContent(url):
    # read page data
    page = urlopen(url)
-    data = page.read(MAX_FILESIZE)
+    data = page.text
    # determine base URL
    baseUrl = None
    match = baseSearch.search(data)
@ -105,7 +102,7 @@ def fetchUrls(url, imageSearch, prevSearch=None):
        imageUrl = match.group(1)
        if not imageUrl:
            raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
-        out.write('matched image URL %r' % imageUrl, 2)
+        out.write('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern), 2)
        imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
    if not imageUrls:
        out.write("warning: no images found at %s with pattern %s" % (url, imageSearch.pattern))
@ -178,22 +175,18 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
    out.write('Open URL %s' % url, 2)
    assert retries >= 0, 'invalid retry value %r' % retries
    assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
-    req = urllib2.Request(url)
+    headers = {'User-Agent': UserAgent}
    config = {"max_retries": retries}
    if referrer:
-        req.add_header('Referer', referrer)
+        headers['Referer'] = referrer
-    req.add_header('User-Agent', UserAgent)
+    try:
-    tries = 0
+        req = requests.get(url, headers=headers, config=config)
-    while True:
+        req.raise_for_status()
-        try:
+        return req
-            return urllib2.urlopen(req)
+    except requests.exceptions.RequestException as err:
-        except IOError as err:
+        msg = 'URL retrieval of %s failed: %s' % (url, err)
-            msg = 'URL retrieval of %s failed: %s' % (url, err)
+        out.write(msg)
-            out.write(msg)
+        raise IOError(msg)
            out.write('waiting %d seconds and retrying (%d)' % (retry_wait_seconds, tries), 2)
            time.sleep(retry_wait_seconds)
            tries += 1
            if tries >= retries:
                raise IOError(msg)
 def get_columns (fp):
@ -259,11 +252,9 @@ def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
    print("""********** Oops, I did it again. *************
 You have found an internal error in %(app)s. Please write a bug report
-at %(url)s and include the following information:
+at %(url)s and include at least the information below:
 - your commandline arguments and any configuration file in ~/.dosage/
 - the system information below
-Not disclosing some of the information above due to privacy reasons is ok.
+Not disclosing some of the information below due to privacy reasons is ok.
 I will try to help you nonetheless, but you have to give me something
 I can work with ;) .
 """ % dict(app=AppName, url=SupportUrl), file=out)
@ -308,6 +299,7 @@ def print_app_info(out=sys.stderr):
                    {"version": sys.version, "platform": sys.platform}, file=out)
    stime = strtime(time.time())
    print("Local time:", stime, file=out)
    print("sys.argv", sys.argv, file=out)
 def strtime(t):
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
 requests
--- a/tests/test_comics.py
+++ b/tests/test_comics.py
@ -4,6 +4,7 @@
 import tempfile
 import shutil
 import re
 import os
 from itertools import islice
 from unittest import TestCase
 from dosagelib import scraper
@ -16,6 +17,16 @@ class _ComicTester(TestCase):
    def setUp(self):
        self.name = self.scraperclass.get_name()
        self.url = self.scraperclass.starter()
        # create a temporary directory for images
        self.tmpdir = tempfile.mkdtemp()
    def tearDown(self):
        shutil.rmtree(self.tmpdir)
    def get_saved_images(self):
        """Get saved images."""
        dirs = tuple(self.name.split('/'))
        return os.listdir(os.path.join(self.tmpdir, *dirs))
    def test_comic(self):
        # Test a scraper. It must be able to traverse backward for
@ -23,7 +34,8 @@ class _ComicTester(TestCase):
        # on at least 4 pages.
        scraperobj = self.scraperclass()
        num = empty = 0
-        for strip in islice(scraperobj.getAllStrips(), 0, 5):
+        max_strips = 5
        for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
            images = 0
            for image in strip.getImages():
                images += 1
@ -35,6 +47,15 @@ class _ComicTester(TestCase):
            num += 1
        if self.scraperclass.prevSearch:
            self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num)
             # check that exactly max_strips images are saved, or at least that many when a strip may have multiple images
            saved_images = self.get_saved_images()
            num_images = len(saved_images)
            if self.scraperclass.multipleImagesPerStrip:
                self.check(num_images >= max_strips, 
                  'saved %d %s instead of at least %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
            else:
                self.check(num_images == max_strips, 
                  'saved %d %s instead of %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
        self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty)
    def check_stripurl(self, strip):
@ -50,28 +71,28 @@ class _ComicTester(TestCase):
        self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch))
    def save(self, image):
        # create a temporary directory
        tmpdir = tempfile.mkdtemp()
        try:
-            image.save(tmpdir)
+            image.save(self.tmpdir)
        except Exception as msg:
-            self.check(False, 'could not save %s to %s: %s' % (image.url, tmpdir, msg))
+            self.check(False, 'could not save %s to %s: %s' % (image.url, self.tmpdir, msg))
        finally:
            shutil.rmtree(tmpdir)
    def check(self, condition, msg):
        self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg))
 def make_comic_tester(name, **kwargs):
    """Create and return a _ComicTester class with given name and attributes."""
    return type(name, (_ComicTester,), kwargs)
 def generate_comic_testers():
    """For each comic scraper, create a test class."""
    g = globals()
    # Limit number of scraper tests for now
-    max_scrapers = 100
+    max_scrapers = 10000
    for scraperclass in islice(scraper.get_scrapers(), 0, max_scrapers):
        name = 'Test'+scraperclass.__name__
-        globals()[name] = type(name,
+        g[name] = make_comic_tester(name, scraperclass=scraperclass)
-            (_ComicTester,),
+
            dict(scraperclass=scraperclass)
        )
 generate_comic_testers()