A lot of refactoring.

2012-10-11 12:03:12 +02:00 · 2012-10-11 12:03:12 +02:00 · c707aa893d
commit c707aa893d
parent 4ba973abf5
37 changed files with 472 additions and 551 deletions
--- a/290
+++ b/290
@ -20,7 +20,6 @@
 import sys
 import os
 import optparse
-import traceback

 from dosagelib import events, scraper
 from dosagelib.output import out
@ -35,13 +34,13 @@ def setupOptions():
    usage = 'usage: %prog [options] comicModule [comicModule ...]'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-v', '--verbose', action='count', dest='verbose', default=0, help='provides verbose output, use multiple times for more verbosity')
-    parser.add_option('-c', '--catch-up', action='count', dest='catchup', default=None, help='traverse and retrieve all available comics up until the strip that already exists locally, use twice to retrieve until all strips exist locally')
-    parser.add_option('-b', '--base-path', action='store', dest='basepath', default='Comics', help='set the path to create invidivual comic directories in, default is Comics', metavar='PATH')
-    parser.add_option('--base-url', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --base-path', metavar='PATH')
+    parser.add_option('-c', '--catchup', action='count', dest='catchup', default=None, help='traverse and retrieve all available comics up until the strip that already exists locally, use twice to retrieve until all strips exist locally')
+    parser.add_option('-b', '--basepath', action='store', dest='basepath', default='Comics', help='set the path to create invidivual comic directories in, default is Comics', metavar='PATH')
+    parser.add_option('--baseurl', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --base-path', metavar='PATH')
    parser.add_option('-l', '--list', action='store_const', const=1, dest='list', help='list available comic modules')
-    parser.add_option('--single-list', action='store_const', const=2, dest='list', help='list available comic modules in a single list')
+    parser.add_option('--singlelist', action='store_const', const=2, dest='list', help='list available comic modules in a single list')
    parser.add_option('-V', '--version', action='store_true', dest='version', help='display the version number')
-    parser.add_option('-m', '--module-help', action='store_true', dest='modhelp', help='display help for comic modules')
+    parser.add_option('-m', '--modulehelp', action='store_true', dest='modhelp', help='display help for comic modules')
    parser.add_option('-t', '--timestamps', action='store_true', dest='timestamps', default=False, help='print timestamps for all output at any info level')
    parser.add_option('-o', '--output', action='store', dest='output', choices=events.getHandlers(), help='output formatting for downloaded comics')
    if is_tty(sys.stdout):
@ -54,196 +53,129 @@ def displayVersion():
    print App
    print Copyright
    print Freeware
+    return 0


-class Dosage(object):
-    """Main program executing comic commands."""
+def setOutputInfo(options):
+    """Set global output level and timestamp option."""
+    out.level = 0
+    out.level += options.verbose
+    out.timestamps = options.timestamps

-    def __init__(self, settings):
-        """Store settings and initialize internal variables."""
-        self.settings = settings
-        self.errors = 0

-    def setOutputInfo(self):
-        """Set global output level and timestamp option."""
-        out.level = 0
-        out.level += self.settings['verbose']
-        out.timestamps = self.settings['timestamps']
+def saveComicStrip(strip, basepath, progress):
+    """Save a comic strip which can consist of multiple images."""
+    errors = 0
+    for image in strip.getImages():
+        try:
+            image.save(basepath, progress)
+        except IOError, msg:
+            out.write('Error saving %s: %s' % (image.filename, msg))
+            errors += 1
+    return errors

-    def saveComic(self, comic):
-        """Save one comic strip in an output file."""
-        basepath = self.settings['basepath']
-        progress = self.settings.get('progress', False)
-        fn, saved = comic.save(basepath, progress)
-        return saved

-    def saveComics(self, comics):
-        """Save a list of comics."""
-        saved = False
+def displayHelp(comics, basepath):
+    """Print help for comic strips."""
+    for scraperobj in getScrapers(comics, basepath):
+        for line in scraperobj.getHelp().splitlines():
+            out.write("Help: "+line)
+    return 0
+
+def getComics(options, comics):
+    errors = 0
+    events.installHandler(options.output, options.basepath, options.baseurl)
+    events.handler.start()
+    for scraperobj in getScrapers(comics, options.basepath):
+        out.context = scraperobj.get_name()
+        if options.catchup:
+            out.write('Catching up...')
+            strips = scraperobj.getAllStrips()
+        else:
+            out.write('Retrieving the current strip...')
+            strips = [scraperobj.getCurrentStrip()]
+        for strip in strips:
+            errors += saveComicStrip(strip, options.basepath, options.progress)
+    events.handler.end()
+    return errors
+
+
+def run(options, comics):
+    """Execute comic commands."""
+    setOutputInfo(options)
+    if options.version:
+        return displayVersion()
+    if options.list:
+        return doList(options.list == 1)
+    if len(comics) <= 0:
+        out.write('Warning: No comics specified, bailing out!')
+        return 1
+    if options.modhelp:
+        return displayHelp(comics, options.basepath)
+    errors = getComics(options, comics)
+
+
+def doList(columnList):
+    """List available comics."""
+    out.write('Available comic scrapers:')
+    scrapers = getScrapers(['@@'])
+    if columnList:
+        doColumnList(scrapers)
+    else:
+        doSingleList(scrapers)
+    out.write('%d supported comics.' % len(scrapers))
+    return 0
+
+
+def doSingleList(scrapers):
+    """Get list of scraper names, one per line."""
+    print '\n'.join(scraperobj.get_name() for scraperobj in scrapers)
+
+
+def doColumnList(scrapers):
+    """Get list of scraper names with multiple names per line."""
+    screenWidth = get_columns()
+    names = [scraperobj.get_name() for scraperobj in scrapers]
+    maxlen = max([len(name) for name in names])
+    namesPerLine = int(screenWidth / (maxlen + 1))
+    while names:
+        print ''.join([name.ljust(maxlen) for name in names[:namesPerLine]])
+        del names[:namesPerLine]
+
+
+def getScrapers(comics, basepath=None):
+    """Get scraper objects for the given comics."""
+    if '@' in comics:
+        # only scrapers whose directory already exists
+        if len(comics) > 1:
+            out.write("WARN: using '@' as comic name ignores all other specified comics.\n")
+        for scraperclass in scraper.get_scrapers():
+            dirname = scraperclass.get_name().replace('/', os.sep)
+            if os.path.isdir(os.path.join(basepath, dirname)):
+                yield scraperclass()
+    elif '@@' in comics:
+        # all scrapers
+        if len(comics) > 1:
+            out.write("WARN: using '@@' as comic name ignores all other specified comics.\n")
+        for scraperclass in scraper.get_scrapers():
+            yield scraperclass()
+    else:
+        # only selected
        for comic in comics:
-            saved = self.saveComic(comic) or saved
-        return saved
-
-    def safeOp(self, fp, *args, **kwargs):
-        """Run a function and catch and report any errors."""
-        try:
-            fp(*args, **kwargs)
-        except Exception:
-            self.errors += 1
-            type, value, tb = sys.exc_info()
-            out.write('Traceback (most recent call last):', 1)
-            out.writelines(traceback.format_stack(), 1)
-            out.writelines(traceback.format_tb(tb)[1:], 1)
-            out.writelines(traceback.format_exception_only(type, value))
-
-    def getCurrent(self):
-        """Retrieve and save all current comic strips."""
-        out.write('Retrieving the current strip...')
-        self.saveComics(self.module.getCurrentComics())
-
-    def getIndex(self, index):
-        """Retrieve comcis with given index."""
-        out.write('Retrieving index "%s"....' % (index,))
-        try:
-            self.module.setStrip(index)
-            self.saveComics(self.module.getNextComics())
-        except NotImplementedError:
-            out.write('No indexed retrieval support.')
-
-    def catchup(self):
-        """Save all comics until the current date."""
-        out.write('Catching up...')
-        for comics in self.module:
-            if not self.saveComics(comics) and self.settings['catchup'] < 2:
-                break
-
-    def catchupIndex(self, index):
-        """Retrieve and save all comics from the given index."""
-        out.write('Catching up from index "%s"...' % (index,))
-        self.module.setStrip(index)
-        for comics in self.module:
-            if not self.saveComics(comics) and self.settings['catchup'] < 2:
-                break
-
-    def getScrapers(self):
-        """Get list of scraper objects."""
-        return scraper.items()
-
-    def getExistingComics(self):
-        """Get all existing comic scrapers."""
-        for scraper in self.getScrapers():
-            dirname = scraper.get_name().replace('/', os.sep)
-            if os.path.isdir(os.path.join(self.settings['basepath'], dirname)):
-                yield scraper
-
-    def doList(self, columnList):
-        """List available comics."""
-        out.write('Available comic scrapers:')
-        scrapers = self.getScrapers()
-        if len(scrapers) > 0:
-            if columnList:
-                self.doColumnList(scrapers)
-            else:
-                self.doSingleList(scrapers)
-        out.write('%d supported comics.' % len(scrapers))
-
-    def doSingleList(self, scrapers):
-        """Get list of scraper names, one per line."""
-        print '\n'.join(scraper.get_name() for scraper in scrapers)
-
-    def doColumnList(self, scrapers):
-        """Get list of scraper names with multiple names per line."""
-        screenWidth = get_columns()
-        names = [scraper.get_name() for scraper in scrapers]
-        maxlen = max([len(name) for name in names])
-        namesPerLine = int(screenWidth / (maxlen + 1))
-        while names:
-            print ''.join([name.ljust(maxlen) for name in names[:namesPerLine]])
-            del names[:namesPerLine]
-
-    def doCatchup(self):
-        """Catchup comics."""
-        for comic in self.useComics():
-            if self.indices:
-                self.safeOp(self.catchupIndex, self.indices[0])
-            else:
-                self.safeOp(self.catchup)
-
-    def doCurrent(self):
-        """Get current comics."""
-        for comic in self.useComics():
-            if self.indices:
-                for index in self.indices:
-                    self.safeOp(self.getIndex, index)
-            else:
-                self.safeOp(self.getCurrent)
-
-    def doHelp(self):
-        """Print help for comic strips."""
-        for scraper in self.useComics():
-            for line in scraper.getHelp().splitlines():
-                out.write("Help: "+line)
-
-    def setupComic(self, scraper):
-        """Setup the internal comic module from given scraper."""
-        self.module = scraper()
-        out.context = scraper.get_name()
-        return self.module
-
-    def useComics(self):
-        """Set all comic modules for the defined comics."""
-        for comic in self.comics:
            c = comic.split(':', 2)
            if len(c) > 1:
-                self.indices = c[1].split(',')
+                indices = c[1].split(',')
            else:
-                self.indices = None
-
+                indices = None
            moduleName = c[0]
-            if moduleName == '@':
-                for s in self.getExistingComics():
-                    yield self.setupComic(s)
-            elif moduleName == '@@':
-                for s in self.getScrapers():
-                    yield self.setupComic(s)
-            else:
-                yield self.setupComic(scraper.get(moduleName))
-
-    def run(self, comics):
-        """Execute comic commands."""
-        self.setOutputInfo()
-        self.comics = comics
-
-        om = self.settings['output']
-        events.installHandler(om, self.settings['basepath'], self.settings['baseurl'])
-        events.handler.start()
-
-        if self.settings['version']:
-            displayVersion()
-        elif self.settings['list']:
-            self.doList(self.settings['list'] == 1)
-        elif len(comics) <= 0:
-            out.write('Warning: No comics specified, bailing out!')
-        elif self.settings['modhelp']:
-            self.doHelp()
-        elif self.settings['catchup']:
-            self.doCatchup()
-        else:
-            self.doCurrent()
-
-        events.handler.end()
+            yield scraper.get_scraper(moduleName)(indices=indices)

 def main():
    """Parse options and execute commands."""
    try:
        parser = setupOptions()
        options, args = parser.parse_args()
-        d = Dosage(options.__dict__)
-        d.run(args)
-        if d.errors:
-            res = 1
-        else:
-            res = 0
+        res = run(options, args)
    except KeyboardInterrupt:
        print "Aborted."
        res = 1
--- a/dosagelib/comic.py
+++ b/dosagelib/comic.py
@ -6,8 +6,6 @@ import locale
 import rfc822
 import time
 import shutil
-# XXX why is this done??
-locale.setlocale(locale.LC_ALL, '')

 from .output import out
 from .util import urlopen, saneDataSize, normaliseURL
@ -18,16 +16,34 @@ class FetchComicError(IOError):
    """Exception for comic fetching errors."""
    pass

-class Comic(object):
-    """Download and save a single comic."""
+class ComicStrip(object):
+    """A list of comic image URLs."""

-    def __init__(self, moduleName, url, referrer=None, filename=None):
+    def __init__(self, name, parentUrl, imageUrls, namer):
+        """Store the image URL list."""
+        self.name = name
+        self.parentUrl = parentUrl
+        self.imageUrls = imageUrls
+        self.namer = namer
+
+    def getImages(self):
+        """Get a list of image downloaders."""
+        for imageUrl in self.imageUrls:
+            yield self.getDownloader(normaliseURL(imageUrl))
+
+    def getDownloader(self, url):
+        filename = self.namer(url, self.parentUrl)
+        return ComicImage(self.name, self.parentUrl, url, filename)
+
+
+class ComicImage(object):
+    def __init__(self, name, referrer, url, filename):
        """Set URL and filename."""
-        self.moduleName = moduleName
-        self.url = normaliseURL(url)
+        self.name = name
        self.referrer = referrer
+        self.url = url
        if filename is None:
-            filename = url.split('/')[-1]
+            filename = url.rsplit('/')[1]
        self.filename, self.ext = os.path.splitext(filename)
        self.filename = self.filename.replace(os.sep, '_')
        self.ext = self.ext.replace(os.sep, '_')
@ -62,13 +78,13 @@ class Comic(object):
    def save(self, basepath, showProgress=False):
        """Save comic URL to filename on disk."""
        self.connect()
-        comicName, comicExt = self.filename, self.ext
+        filename = "%s%s" % (self.filename, self.ext)
        comicSize = self.contentLength
-        comicDir = os.path.join(basepath, self.moduleName.replace('/', os.sep))
+        comicDir = os.path.join(basepath, self.name.replace('/', os.sep))
        if not os.path.isdir(comicDir):
            os.makedirs(comicDir)

-        fn = os.path.join(comicDir, '%s%s' % (self.filename, self.ext))
+        fn = os.path.join(comicDir, filename)
        if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
            self.urlobj.close()
            self.touch(fn)
@ -76,10 +92,8 @@ class Comic(object):
            return fn, False

        try:
-            tmpFn = os.path.join(comicDir, '__%s%s' % (self.filename, self.ext))
-            out.write('Writing comic to temporary file %s...' % (tmpFn,), 3)
-            comicOut = file(tmpFn, 'wb')
-            try:
+            out.write('Writing comic to file %s...' % (fn,), 3)
+            with open(fn, 'wb') as comicOut:
                startTime = time.time()
                if showProgress:
                    def pollData():
@ -92,12 +106,12 @@ class Comic(object):
                else:
                    comicOut.write(self.urlobj.read())
                endTime = time.time()
-            finally:
-                comicOut.close()
-            out.write('Copying temporary file (%s) to %s...' % (tmpFn, fn), 3)
-            shutil.copy2(tmpFn, fn)
            self.touch(fn)
-
+        except:
+            if os.path.isfile(fn):
+                os.remove(fn)
+            raise
+        else:
            size = os.path.getsize(fn)
            bytes = locale.format('%d', size, True)
            if endTime != startTime:
@ -106,13 +120,8 @@ class Comic(object):
                speed = '???'
            attrs = dict(fn=fn, bytes=bytes, speed=speed)
            out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
-            handler.comicDownloaded(self.moduleName, fn)
-            self.urlobj.close()
+            handler.comicDownloaded(self.name, fn)
        finally:
-            try:
-                out.write('Removing temporary file %s...' % (tmpFn,), 3)
-                os.remove(tmpFn)
-            except:
-                pass
+            self.urlobj.close()

        return fn, True
--- a/dosagelib/configuration.py
+++ b/dosagelib/configuration.py
@ -9,11 +9,13 @@ AppName = configdata.name
 App = AppName+u" "+Version
 Author = configdata.author
 HtmlAuthor = Author.replace(u' ', u'&nbsp;')
-Copyright = u"Copyright (C) 2004-2008 "+Author
-HtmlCopyright = u"Copyright &copy; 2004-2008 "+HtmlAuthor
+Maintainer = configdata.maintainer
+HtmlMaintainer = Maintainer.replace(u' ', u'&nbsp;')
+Copyright = u"Copyright (C) 2004-2008 "+Author+u", (C) 2012 "+Maintainer
+HtmlCopyright = u"Copyright &copy; 2004-2008 "+HtmlAuthor+u", 2012 "+HtmlMaintainer
 Url = configdata.url
 SupportUrl = Url + u"/issues"
-Email = configdata.author_email
+Email = configdata.maintainer_email
 UserAgent = u"Mozilla/5.0 (compatible; %s/%s; +%s)" % (AppName, Version, Url)
 Freeware = AppName+u""" comes with ABSOLUTELY NO WARRANTY!
 This is free software, and you are welcome to redistribute it
--- a/dosagelib/helpers.py
+++ b/dosagelib/helpers.py
@ -2,112 +2,10 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
 import re
+import urlparse

-from .util import fetchUrl, fetchManyUrls, getQueryParams
-from .comic import Comic
-
-class _BasicScraper(object):
-    '''Base class with scrape functions for comics.
-
-    @type latestUrl: C{string}
-    @cvar latestUrl: The URL for the latest comic strip.
-    @type imageUrl: C{string}
-    @cvar imageUrl: A string that is interpolated with the strip index
-        to yield the URL for a particular strip.
-    @type imageSearch: C{regex}
-    @cvar imageSearch: A compiled regex that will locate the strip image URL
-        when applied to the strip page.
-    @type prevSearch: C{regex}
-    @cvar prevSearch: A compiled regex that will locate the URL for the
-        previous strip when applied to a strip page.
-    '''
-    referrer = None
-    help = 'Sorry, no help for this comic yet.'
-
-    def __init__(self):
-        """Initialize internal variables."""
-        self.currentUrl = None
-        self.urls = set()
-
-    def getReferrer(self, imageUrl, pageUrl):
-        """Return referrer for HTTP connection."""
-        return self.referrer or pageUrl or self.getLatestUrl()
-
-    def getComic(self, url, pageUrl):
-        """Get comic downloader for given URL and page."""
-        if not url:
-            return None
-        return Comic(self.get_name(), url, filename=self.getFilename(url, pageUrl), referrer=self.getReferrer(url, pageUrl))
-
-    def getCurrentComics(self):
-        """Get list of current comics."""
-        self.currentUrl = self.getLatestUrl()
-        comics = self.getNextComics()
-        if not comics:
-            raise ValueError("Could not find current comic.")
-        return comics
-
-    def getNextComics(self):
-        """Get all next comics."""
-        comics = []
-        while not comics and self.currentUrl and self.currentUrl not in self.urls:
-            comicUrlGroups, prevUrl = fetchManyUrls(self.currentUrl, [self.imageSearch, self.prevSearch])
-
-            if prevUrl:
-                prevUrl = prevUrl[0]
-            else:
-                prevUrl = None
-
-            for comicUrl in comicUrlGroups:
-                comics.append(self.getComic(comicUrl, self.currentUrl))
-
-            self.urls.update([self.currentUrl])
-            self.currentUrl = (prevUrl, None)[prevUrl in self.urls]
-        return comics
-
-    def setStrip(self, index):
-        """Set current comic strip URL."""
-        self.currentUrl = self.imageUrl % index
-
-    def getHelp(self):
-        """Return help text for this scraper."""
-        return self.help
-
-    def __iter__(self):
-        """Iterate through the strips, starting from the current one and going backward."""
-        if not self.currentUrl:
-            self.currentUrl = self.getLatestUrl()
-        comics = True
-        while comics:
-            comics = self.getNextComics()
-            if comics:
-                yield comics
-
-    @classmethod
-    def get_name(cls):
-        """Get scraper name."""
-        if hasattr(cls, 'name'):
-            return cls.name
-        return cls.__name__
-
-    @classmethod
-    def starter(cls):
-        """Get starter URL from where to scrape comic strips."""
-        return cls.latestUrl
-
-    @classmethod
-    def namer(cls, imageUrl, pageUrl):
-        """Return filename for given image and page URL."""
-        return None
-
-    def getFilename(self, imageUrl, pageUrl):
-        """Return filename for given image and page URL."""
-        return self.namer(imageUrl, pageUrl)
-
-    def getLatestUrl(self):
-        """Get starter URL from where to scrape comic strips."""
-        return self.starter()
-
+from .util import fetchUrl, getQueryParams
+from .scraper import _BasicScraper

 def queryNamer(paramName, usePageUrl=False):
    """Get name from URL query part."""
--- a/dosagelib/loader.py
+++ b/dosagelib/loader.py
@ -0,0 +1,74 @@
+# -*- coding: iso-8859-1 -*-
+
+import os
+import sys
+
+def get_modules(folder, importprefix):
+    """Find all valid modules in the plugins directory. A valid module
+    must have a .py extension and must be importable.
+    @return: all loaded valid modules
+    @rtype: iterator of module
+    """
+    for filename in get_importable_modules(folder):
+        try:
+            module = load_module(filename, importprefix)
+            if module is not None:
+                yield module
+        except StandardError, msg:
+            print "ERROR: could not load module %s: %s" % (filename, msg)
+
+
+def get_importable_modules(folder):
+    """Find all module files in the given folder that end with '.py' and
+    don't start with an underscore.
+    @return: module filenames
+    @rtype: iterator of string
+    """
+    for fname in os.listdir(folder):
+        if fname.endswith('.py') and not fname.startswith('_'):
+            yield os.path.join(folder, fname)
+
+
+def load_module(filename, importprefix):
+    """Load and return the module given by the filename.
+    Other exceptions than ImportError are not caught.
+    @return: loaded module or None on import errors
+    @rtype: module or None
+    """
+    name = os.path.splitext(os.path.basename(filename))[0]
+    modulename = "%s%s" % (importprefix, name)
+    __import__(modulename)
+    return sys.modules[modulename]
+
+
+def get_plugins(modules, classobj):
+    """Find all scrapers in all modules.
+    @param modules: the modules to search
+    @type modules: iterator of modules
+    @return: found scrapers
+    @rtype: iterator of class objects
+    """
+    for module in modules:
+        for plugin in get_module_plugins(module, classobj):
+            yield plugin
+
+
+def get_module_plugins(module, classobj):
+    """Return all subclasses of classobj in the module.
+    If the module defines __all__, only those entries will be searched,
+    otherwise all objects not starting with '_' will be searched.
+    """
+    try:
+        names = module.__all__
+    except AttributeError:
+        names = [x for x in vars(module) if not x.startswith('_')]
+    for name in names:
+        try:
+            obj = getattr(module, name)
+        except AttributeError:
+            continue
+        try:
+            if issubclass(obj, classobj):
+                yield obj
+        except TypeError:
+            continue
--- a/dosagelib/plugins/a.py
+++ b/dosagelib/plugins/a.py
@ -2,14 +2,14 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, MULTILINE
 from ..util import tagre
-
-from ..helpers import _BasicScraper, regexNamer, bounceStarter, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import regexNamer, bounceStarter, indirectStarter


 class ALessonIsLearned(_BasicScraper):
    latestUrl = 'http://www.alessonislearned.com/'
    imageUrl = 'http://www.alessonislearned.com/lesson%s.html'
-    imageSearch = compile(tagre("img", "src", r"(cmx/.+?)"))
+    imageSearch = compile(tagre("img", "src", r"(cmx/lesson.+?)"))
    prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=.+?)")+r".+?previous")
    help = 'Index format: nnn'

--- a/dosagelib/plugins/b.py
+++ b/dosagelib/plugins/b.py
@ -2,7 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile

-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper


 class BadlyDrawnKitties(_BasicScraper):
--- a/dosagelib/plugins/c.py
+++ b/dosagelib/plugins/c.py
@ -2,18 +2,23 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile

-from ..helpers import (
-    _BasicScraper, constStarter, bounceStarter, indirectStarter)
-from ..util import getQueryParams
+from ..scraper import _BasicScraper
+from ..helpers import constStarter, bounceStarter, indirectStarter
+from ..util import tagre, getQueryParams


 class CalvinAndHobbes(_BasicScraper):
-    latestUrl = 'http://www.gocomics.com/calvinandhobbes/'
+    starter = bounceStarter('http://www.gocomics.com/calvinandhobbes/',
+      compile(tagre("a", "href", "(/calvinandhobbes/\d+/\d+/\d+)")+"Next feature</a>"))
    imageUrl = 'http://www.gocomics.com/calvinandhobbes/%s'
-    imageSearch = compile(r'src="(http://picayune\.uclick\.com/comics/ch/[^"]+\.gif)"')
-    prevSearch = compile(r'href="(.*?)"\s+onclick="[^"]*">Previous day</a>')
+    imageSearch = compile(tagre("img", "src", "(http://assets\.amuniversal\.com/[a-f0-9]+)"))
+    prevSearch = compile(tagre("a", "href", "(/calvinandhobbes/\d+/\d+/\d+)")+"Previous feature</a>")
    help = 'Index format: yyyy/mm/dd'

+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        prefix, year, month, day = pageUrl.rsplit('/', 3)
+        return "%s%s%s.gif" % (year, month, day)


 class CandyCartoon(_BasicScraper):
--- a/dosagelib/plugins/d.py
+++ b/dosagelib/plugins/d.py
@ -2,7 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE, MULTILINE

-from ..helpers import _BasicScraper, bounceStarter, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter, indirectStarter
 from ..util import getQueryParams


--- a/dosagelib/plugins/drunkduck.py
+++ b/dosagelib/plugins/drunkduck.py
@ -2,7 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper, bounceStarter, queryNamer
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter, queryNamer


 def drunkDuck(shortName):
--- a/dosagelib/plugins/e.py
+++ b/dosagelib/plugins/e.py
@ -2,7 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper, indirectStarter
+from ..helpers import indirectStarter
+from ..scraper import _BasicScraper


 class EerieCuties(_BasicScraper):
--- a/dosagelib/plugins/f.py
+++ b/dosagelib/plugins/f.py
@ -1,9 +1,10 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE, MULTILINE
-from ..util import tagre

-from ..helpers import _BasicScraper, indirectStarter
+from ..util import tagre
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter


 class FalconTwin(_BasicScraper):
--- a/dosagelib/plugins/g.py
+++ b/dosagelib/plugins/g.py
@ -2,7 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile

-from ..helpers import _BasicScraper, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter


 class Galaxion(_BasicScraper):
--- a/dosagelib/plugins/h.py
+++ b/dosagelib/plugins/h.py
@ -2,7 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile

-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper


 class HappyMedium(_BasicScraper):
--- a/dosagelib/plugins/i.py
+++ b/dosagelib/plugins/i.py
@ -2,7 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper


 class IDreamOfAJeanieBottle(_BasicScraper):
--- a/dosagelib/plugins/j.py
+++ b/dosagelib/plugins/j.py
@ -2,8 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, MULTILINE

-from ..helpers import _BasicScraper
-
+from ..scraper import _BasicScraper


 class Jack(_BasicScraper):
--- a/dosagelib/plugins/k.py
+++ b/dosagelib/plugins/k.py
@ -2,8 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper
-
+from ..scraper import _BasicScraper


 class KernelPanic(_BasicScraper):
--- a/dosagelib/plugins/keenspot.py
+++ b/dosagelib/plugins/keenspot.py
@ -2,7 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper


 def keenSpot(comics):
--- a/dosagelib/plugins/l.py
+++ b/dosagelib/plugins/l.py
@ -2,8 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile

-from ..helpers import _BasicScraper, indirectStarter
-
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter


 class LasLindas(_BasicScraper):
--- a/dosagelib/plugins/m.py
+++ b/dosagelib/plugins/m.py
@ -2,7 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper, queryNamer
+from ..scraper import _BasicScraper
+from ..helpers import queryNamer


 class MadamAndEve(_BasicScraper):
--- a/dosagelib/plugins/n.py
+++ b/dosagelib/plugins/n.py
@ -2,7 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper, indirectStarter, _PHPScraper
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter, _PHPScraper



--- a/dosagelib/plugins/num.py
+++ b/dosagelib/plugins/num.py
@ -2,8 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile

-from ..helpers import _BasicScraper
-
+from ..scraper import _BasicScraper


 class NineteenNinetySeven(_BasicScraper):
--- a/dosagelib/plugins/o.py
+++ b/dosagelib/plugins/o.py
@ -2,7 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter


 class OctopusPie(_BasicScraper):
--- a/dosagelib/plugins/p.py
+++ b/dosagelib/plugins/p.py
@ -2,7 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper, bounceStarter, queryNamer
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter, queryNamer


 class PartiallyClips(_BasicScraper):
--- a/dosagelib/plugins/q.py
+++ b/dosagelib/plugins/q.py
@ -2,8 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile

-from ..helpers import _BasicScraper
-
+from ..scraper import _BasicScraper


 class QuestionableContent(_BasicScraper):
--- a/dosagelib/plugins/r.py
+++ b/dosagelib/plugins/r.py
@ -2,8 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile

-from ..helpers import _BasicScraper, bounceStarter
-
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter


 class RadioactivePanda(_BasicScraper):
--- a/dosagelib/plugins/s.py
+++ b/dosagelib/plugins/s.py
@ -3,7 +3,8 @@
 from re import compile, MULTILINE, IGNORECASE, sub
 from os.path import splitext

-from ..helpers import _BasicScraper, bounceStarter, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter, indirectStarter


 class SailorsunOrg(_BasicScraper):
--- a/dosagelib/plugins/t.py
+++ b/dosagelib/plugins/t.py
@ -2,8 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper, indirectStarter
-
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter


 class TalesOfPylea(_BasicScraper):
--- a/dosagelib/plugins/u.py
+++ b/dosagelib/plugins/u.py
@ -2,7 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE

-from ..helpers import _BasicScraper, bounceStarter, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter, indirectStarter
 from ..util import getQueryParams


--- a/dosagelib/plugins/uc.py
+++ b/dosagelib/plugins/uc.py
@ -2,8 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE, sub

-from ..helpers import _BasicScraper
-from ..util import fetchManyMatches, fetchUrl
+from ..scraper import _BasicScraper
+from ..util import fetchUrl


 class _UClickScraper(_BasicScraper):
@ -24,6 +24,7 @@ class _UClickScraper(_BasicScraper):
            'index',
            )

+        # XXX refactor this mess
        submoduleSearch = compile(r'(<A HREF="http://content.uclick.com/content/\w+.html">[^>]+?</a>)', IGNORECASE)
        partsMatch = compile(r'<A HREF="http://content.uclick.com/content/(\w+?).html">([^>]+?)</a>', IGNORECASE)
        matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]
--- a/dosagelib/plugins/v.py
+++ b/dosagelib/plugins/v.py
@ -2,8 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE, MULTILINE

-from ..helpers import _BasicScraper
-
+from ..scraper import _BasicScraper


 class _VGCats(_BasicScraper):
--- a/dosagelib/plugins/w.py
+++ b/dosagelib/plugins/w.py
@ -2,7 +2,8 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, IGNORECASE, DOTALL

-from ..helpers import _BasicScraper, queryNamer, bounceStarter
+from ..scraper import _BasicScraper
+from ..helpers import queryNamer, bounceStarter


 class WayfarersMoon(_BasicScraper):
--- a/dosagelib/plugins/x.py
+++ b/dosagelib/plugins/x.py
@ -2,7 +2,9 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile

-from ..helpers import _BasicScraper, bounceStarter
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter
+

 class xkcd(_BasicScraper):
    starter = bounceStarter('http://xkcd.com/', compile(r'<a rel="next" href="(/?\d+/?)"[^>]*>Next'))
--- a/dosagelib/plugins/y.py
+++ b/dosagelib/plugins/y.py
@ -2,7 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile, MULTILINE

-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper


 class YAFGC(_BasicScraper):
--- a/dosagelib/plugins/z.py
+++ b/dosagelib/plugins/z.py
@ -2,7 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 from re import compile

-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper


 class Zapiro(_BasicScraper):
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@ -2,47 +2,122 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
 import os
-import sys
-
-from .helpers import _BasicScraper
+from . import loader
+from .util import fetchUrls
+from .comic import ComicStrip

 disabled = []
 def init_disabled():
    filename = os.path.expanduser('~/.dosage/disabled')
-    if not os.path.isfile(filename):
-        return
-    with open(filename) as f:
-        for line in f:
-            if line and not line.startswith('#'):
-                disabled.append(line.rstrip())
+    if os.path.isfile(filename):
+        with open(filename) as f:
+            for line in f:
+                if line and not line.startswith('#'):
+                    disabled.append(line.rstrip())
 init_disabled()

 class DisabledComicError(ValueError):
    pass


-def get(comicName):
+class _BasicScraper(object):
+    '''Base class with scrape functions for comics.
+
+    @type latestUrl: C{string}
+    @cvar latestUrl: The URL for the latest comic strip.
+    @type imageUrl: C{string}
+    @cvar imageUrl: A string that is interpolated with the strip index
+        to yield the URL for a particular strip.
+    @type imageSearch: C{regex}
+    @cvar imageSearch: A compiled regex that will locate the strip image URL
+        when applied to the strip page.
+    @type prevSearch: C{regex}
+    @cvar prevSearch: A compiled regex that will locate the URL for the
+        previous strip when applied to a strip page.
+    '''
+    help = 'Sorry, no help for this comic yet.'
+
+    def __init__(self, indices=None):
+        """Initialize internal variables."""
+        self.urls = set()
+        self.indices = indices
+
+    def getCurrentStrip(self):
+        """Get current comic strip."""
+        return self.getStrip(self.getLatestUrl())
+
+    def getStrip(self, url):
+        """Get comic strip for given URL."""
+        imageUrls = fetchUrls(url, self.imageSearch)
+        return self.getComicStrip(url, imageUrls)
+
+    def getComicStrip(self, url, imageUrls):
+        """Get comic strip downloader for given URL and images."""
+        return ComicStrip(self.get_name(), url, imageUrls, self.namer)
+
+    def getAllStrips(self):
+        """Get all comic strips."""
+        seen_urls = set()
+        url = self.getLatestUrl()
+        while url:
+            imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
+            seen_urls.add(url)
+            yield self.getComicStrip(url, imageUrls)
+            # avoid recursive URL loops
+            url = prevUrl if prevUrl not in seen_urls else None
+
+    def setStrip(self, index):
+        """Set current comic strip URL."""
+        self.currentUrl = self.imageUrl % index
+
+    def getHelp(self):
+        """Return help text for this scraper."""
+        return self.help
+
+    @classmethod
+    def get_name(cls):
+        """Get scraper name."""
+        if hasattr(cls, 'name'):
+            return cls.name
+        return cls.__name__
+
+    @classmethod
+    def starter(cls):
+        """Get starter URL from where to scrape comic strips."""
+        return cls.latestUrl
+
+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        """Return filename for given image and page URL."""
+        return None
+
+    def getFilename(self, imageUrl, pageUrl):
+        """Return filename for given image and page URL."""
+        return self.namer(imageUrl, pageUrl)
+
+    def getLatestUrl(self):
+        """Get starter URL from where to scrape comic strips."""
+        return self.starter()
+
+
+def get_scraper(comic):
    """Returns a comic module object."""
    candidates = []
-    for scraper in get_scrapers():
-        lname = scraper.get_name().lower()
-        cname = comicName.lower()
+    cname = comic.lower()
+    for scraperclass in get_scrapers():
+        lname = scraperclass.get_name().lower()
        if lname == cname:
            # perfect match
-            return scraper
+            return scraperclass
        if cname in lname:
-            candidates.append(scraper)
+            candidates.append(scraperclass)
    if len(candidates) == 1:
        return candidates[0]
    elif candidates:
        comics = ", ".join(x.get_name() for x in candidates)
        raise ValueError('Multiple comics %s found.' % comics)
    else:
-        raise ValueError('Comic %r not found.' % comicName)
-
-
-def items():
-    return get_scrapers()
+        raise ValueError('Comic %r not found.' % comic)


 _scrapers = None
@ -54,91 +129,23 @@ def get_scrapers():
    """
    global _scrapers
    if _scrapers is None:
-        _scrapers = list(get_all_plugins(get_modules()))
+        folder = os.path.join(os.path.dirname(__file__), 'plugins')
+        importprefix = 'dosagelib.plugins.'
+        modules = loader.get_modules(folder, importprefix)
+        plugins = loader.get_plugins(modules, _BasicScraper)
+        _scrapers = list(plugins)
        _scrapers.sort(key=lambda s: s.get_name())
        check_scrapers()
    return _scrapers


 def check_scrapers():
+    """Check for duplicate scraper class names."""
    d = {}
-    for s in _scrapers:
-        name = s.get_name().lower()
+    for scraperclass in _scrapers:
+        name = scraperclass.get_name().lower()
        if name in d:
-            name1 = s.get_name()
+            name1 = scraperclass.get_name()
            name2 = d[name].get_name()
            raise ValueError('Duplicate scrapers %s and %s found' % (name1, name2))
-        d[name] = s
-
-
-def get_modules():
-    """Find all valid modules in the plugins directory. A valid module
-    must have a .py extension, and is importable.
-    @return: all loaded valid modules
-    @rtype: iterator of module
-    """
-    # load from the plugins folder
-    folder = os.path.join(os.path.dirname(__file__), 'plugins')
-    for filename in get_importable_modules(folder):
-        try:
-            module = load_module(filename)
-            if module is not None:
-                yield module
-        except StandardError, msg:
-            print "ERROR", msg
-
-
-def get_importable_modules(folder):
-    """Find all module files in the given folder that end witn '.py' and
-    don't start with an underscore.
-    @return module filenames
-    @rtype: iterator of string
-    """
-    for fname in os.listdir(folder):
-        if fname.endswith('.py') and not fname.startswith('_'):
-            yield os.path.join(folder, fname)
-
-
-def load_module(filename):
-    """Load and return the module given by the filename.
-    Other exceptions than ImportError are not catched.
-    @return: loaded module or None on import errors
-    @rtype: module or None
-    """
-    name = os.path.splitext(os.path.basename(filename))[0]
-    modulename = "dosagelib.plugins.%s" % name
-    __import__(modulename)
-    return sys.modules[modulename]
-
-
-def get_all_plugins(modules):
-    """Find all scrapers in all modules.
-    @param modules: the modules to search
-    @ptype modules: iterator of modules
-    @return: found scrapers
-    @rytpe: iterator of class objects
-    """
-    for module in modules:
-        for plugin in get_plugins(module):
-            yield plugin
-
-
-def get_plugins(module):
-    """Return all subclasses of _BasicScraper in the module.
-    If the module defines __all__, only those entries will be searched,
-    otherwise all objects not starting with '_' will be searched.
-    """
-    try:
-        names = module.__all__
-    except AttributeError:
-        names = [x for x in vars(module) if not x.startswith('_')]
-    for name in names:
-        try:
-            obj = getattr(module, name)
-        except AttributeError:
-            continue
-        try:
-            if issubclass(obj, _BasicScraper):
-                yield obj
-        except TypeError:
-            continue
+        d[name] = scraperclass
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@ -21,72 +21,90 @@ if os.name == 'nt':

 has_curses = has_module("curses")

-class NoMatchError(Exception):
-    pass
+MAX_FILESIZE = 1024*1024*1 # 1MB

-def getMatchValues(matches):
-    return set([match.group(1) for match in matches])
+def tagre(tag, attribute, value):
+    """Return a regular expression matching the given HTML tag, attribute
+    and value. It matches the tag and attribute names case insensitive,
+    and skips arbitrary whitespace and leading HTML attributes. The "<>" at
+    the start and end of the HTML tag is also matched.
+    @param tag: the tag name
+    @ptype tag: string
+    @param attribute: the attribute name
+    @ptype attribute: string
+    @param value: the attribute value
+    @ptype value: string
+    @return: the generated regular expression suitable for re.compile()
+    @rtype: string
+    """
+    attrs = dict(
+        tag=case_insensitive_re(tag),
+        attribute=case_insensitive_re(attribute),
+        value=value,
+    )
+    return r'<\s*%(tag)s[^>]*\s+%(attribute)s\s*=\s*"%(value)s"[^>]*/?>' % attrs

-def fetchManyMatches(url, regexes):
-    '''Returns a list containing lists of matches for each regular expression, in the same order.'''
-    out.write('Matching regex(es) %r multiple times against %s...' % ([rex.pattern for rex in regexes], url), 2)
+
+def case_insensitive_re(name):
+    """Reformat the given name to a case insensitive regular expression string
+    without using re.IGNORECASE. This way selective strings can be made case
+    insensitive.
+    @param name: the name to make case insensitive
+    @ptype name: string
+    @return: the case insensitive regex
+    @rtype: string
+    """
+    return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
+
+
+baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
+
+def getPageContent(url):
+    # read page data
    page = urlopen(url)
-    data = page.read()
-
-    matches = [getMatchValues(regex.finditer(data)) for regex in regexes]
-    if matches:
-        out.write('...found %r' % (matches,), 2)
-    else:
-        out.write('...not found!', 2)
-
-    return list(matches)
-
-def fetchMatches(url, regexes):
-    out.write('Matching regex(es) %r against %s...' % ([rex.pattern for rex in regexes], url), 2)
-    page = urlopen(url)
-    data = page.read()
-
-    matches = []
-    for regex in regexes:
-        match = regex.search(data)
-        if match:
-            matches.append(match.group(1))
-
-    if matches:
-        out.write('...found %r' % (matches,), 2)
-    else:
-        out.write('...not found!', 2)
-
-    return matches
-
-def fetchMatch(url, regex):
-    matches = fetchMatches(url, (regex,))
-    if matches:
-        return matches[0]
-    return None
-
-def fetchUrl(url, regex):
-    match = fetchMatch(url, regex)
+    data = page.read(MAX_FILESIZE)
+    # determine base URL
+    baseUrl = None
+    match = baseSearch.search(data)
    if match:
-        return urlparse.urljoin(url, match)
+        baseUrl = match.group(1)
+    else:
+        baseUrl = url
+    return data, baseUrl
+
+
+def fetchUrl(url, searchRo):
+    data, baseUrl = getPageContent(url)
+    match = searchRo.search(data)
+    if match:
+        searchUrl = match.group(1)
+        out.write('matched URL %r' % searchUrl, 2)
+        return urlparse.urljoin(baseUrl, searchUrl)
    return None

-baseSearch = re.compile(r'<base\s+href="([^"]*)"\s+/?>', re.IGNORECASE)
-def fetchUrls(url, regexes):
-    matches = fetchMatches(url, [baseSearch] + list(regexes))
-    baseUrl = matches.pop(0) or url
-    return [urlparse.urljoin(baseUrl, match) for match in matches]

-def fetchManyUrls(url, regexes):
-    matchGroups = fetchManyMatches(url, [baseSearch] + list(regexes))
-    baseUrl = matchGroups.pop(0) or [url]
-    baseUrl = baseUrl[0]
+def fetchUrls(url, imageSearch, prevSearch=None):
+    data, baseUrl = getPageContent(url)
+    # match images
+    imageUrls = set()
+    for match in imageSearch.finditer(data):
+        imageUrl = match.group(1)
+        out.write('matched image URL %r' % imageUrl, 2)
+        imageUrls.add(urlparse.urljoin(baseUrl, imageUrl))
+    if not imageUrls:
+        raise ValueError("No images found at %s with pattern %s" % (url, imageSearch.pattern))
+    if prevSearch is not None:
+        # match previous URL
+        match = prevSearch.search(data)
+        if match:
+            prevUrl = match.group(1)
+            out.write('matched previous URL %r' % prevUrl, 2)
+            prevUrl = urlparse.urljoin(baseUrl, prevUrl)
+        else:
+            prevUrl = None
+        return imageUrls, prevUrl
+    return imageUrls

-    xformedGroups = []
-    for matchGroup in matchGroups:
-        xformedGroups.append([urlparse.urljoin(baseUrl, match) for match in matchGroup])
-
-    return xformedGroups

 def _unescape(text):
    """
@ -278,37 +296,3 @@ def strtimezone():
    else:
        zone = time.timezone
    return "%+04d" % (-zone//3600)
-
-
-def tagre(tag, attribute, value):
-    """Return a regular expression matching the given HTML tag, attribute
-    and value. It matches the tag and attribute names case insensitive,
-    and skips arbitrary whitespace and leading HTML attributes. The "<>" at
-    the start and end of the HTML tag is also matched.
-    @param tag: the tag name
-    @ptype tag: string
-    @param attribute: the attribute name
-    @ptype attribute: string
-    @param value: the attribute value
-    @ptype value: string
-    @return: the generated regular expression suitable for re.compile()
-    @rtype: string
-    """
-    attrs = dict(
-        tag=case_insensitive_re(tag),
-        attribute=case_insensitive_re(attribute),
-        value=value,
-    )
-    return r'<\s*%(tag)s[^>]*\s+%(attribute)s\s*=\s*"%(value)s"[^>]>' % attrs
-
-def case_insensitive_re(name):
-    """Reformat the given name to a case insensitive regular expression string
-    without using re.IGNORECASE. This way selective strings can be made case
-    insensitive.
-    @param name: the name to make case insensitive
-    @ptype name: string
-    @return: the case insenstive regex
-    @rtype: string
-    """
-    return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
-