From c707aa893d8f7a51d51d5053a4b5f4a422d2b81a Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Thu, 11 Oct 2012 12:03:12 +0200 Subject: [PATCH] A lot of refactoring. --- dosage | 290 +++++++++++++-------------------- dosagelib/comic.py | 63 ++++--- dosagelib/configuration.py | 8 +- dosagelib/helpers.py | 108 +----------- dosagelib/loader.py | 74 +++++++++ dosagelib/plugins/a.py | 6 +- dosagelib/plugins/b.py | 2 +- dosagelib/plugins/c.py | 17 +- dosagelib/plugins/d.py | 3 +- dosagelib/plugins/drunkduck.py | 3 +- dosagelib/plugins/e.py | 3 +- dosagelib/plugins/f.py | 5 +- dosagelib/plugins/g.py | 3 +- dosagelib/plugins/h.py | 2 +- dosagelib/plugins/i.py | 2 +- dosagelib/plugins/j.py | 3 +- dosagelib/plugins/k.py | 3 +- dosagelib/plugins/keenspot.py | 2 +- dosagelib/plugins/l.py | 4 +- dosagelib/plugins/m.py | 3 +- dosagelib/plugins/n.py | 3 +- dosagelib/plugins/num.py | 3 +- dosagelib/plugins/o.py | 3 +- dosagelib/plugins/p.py | 3 +- dosagelib/plugins/q.py | 3 +- dosagelib/plugins/r.py | 4 +- dosagelib/plugins/s.py | 3 +- dosagelib/plugins/t.py | 4 +- dosagelib/plugins/u.py | 3 +- dosagelib/plugins/uc.py | 5 +- dosagelib/plugins/v.py | 3 +- dosagelib/plugins/w.py | 3 +- dosagelib/plugins/x.py | 4 +- dosagelib/plugins/y.py | 2 +- dosagelib/plugins/z.py | 2 +- dosagelib/scraper.py | 203 ++++++++++++----------- dosagelib/util.py | 168 +++++++++---------- 37 files changed, 472 insertions(+), 551 deletions(-) create mode 100644 dosagelib/loader.py diff --git a/dosage b/dosage index 046b6fbad..cce4fa895 100755 --- a/dosage +++ b/dosage @@ -20,7 +20,6 @@ import sys import os import optparse -import traceback from dosagelib import events, scraper from dosagelib.output import out @@ -35,13 +34,13 @@ def setupOptions(): usage = 'usage: %prog [options] comicModule [comicModule ...]' parser = optparse.OptionParser(usage=usage) parser.add_option('-v', '--verbose', action='count', dest='verbose', default=0, help='provides verbose output, use multiple times for more verbosity') - parser.add_option('-c', '--catch-up', action='count', dest='catchup', default=None, help='traverse and retrieve all available comics up until the strip that already exists locally, use twice to retrieve until all strips exist locally') - parser.add_option('-b', '--base-path', action='store', dest='basepath', default='Comics', help='set the path to create invidivual comic directories in, default is Comics', metavar='PATH') - parser.add_option('--base-url', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --base-path', metavar='PATH') + parser.add_option('-c', '--catchup', action='count', dest='catchup', default=None, help='traverse and retrieve all available comics up until the strip that already exists locally, use twice to retrieve until all strips exist locally') + parser.add_option('-b', '--basepath', action='store', dest='basepath', default='Comics', help='set the path to create invidivual comic directories in, default is Comics', metavar='PATH') + parser.add_option('--baseurl', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --base-path', metavar='PATH') parser.add_option('-l', '--list', action='store_const', const=1, dest='list', help='list available comic modules') - parser.add_option('--single-list', action='store_const', const=2, dest='list', help='list available comic modules in a single list') + parser.add_option('--singlelist', action='store_const', const=2, dest='list', help='list available comic modules in a single list') parser.add_option('-V', '--version', action='store_true', dest='version', help='display the version number') - parser.add_option('-m', '--module-help', action='store_true', dest='modhelp', help='display help for comic modules') + parser.add_option('-m', '--modulehelp', action='store_true', dest='modhelp', help='display help for comic modules') parser.add_option('-t', '--timestamps', action='store_true', dest='timestamps', default=False, help='print timestamps for all output at any info level') parser.add_option('-o', '--output', action='store', dest='output', choices=events.getHandlers(), help='output formatting for downloaded comics') if is_tty(sys.stdout): @@ -54,196 +53,129 @@ def displayVersion(): print App print Copyright print Freeware + return 0 -class Dosage(object): - """Main program executing comic commands.""" +def setOutputInfo(options): + """Set global output level and timestamp option.""" + out.level = 0 + out.level += options.verbose + out.timestamps = options.timestamps - def __init__(self, settings): - """Store settings and initialize internal variables.""" - self.settings = settings - self.errors = 0 - def setOutputInfo(self): - """Set global output level and timestamp option.""" - out.level = 0 - out.level += self.settings['verbose'] - out.timestamps = self.settings['timestamps'] +def saveComicStrip(strip, basepath, progress): + """Save a comic strip which can consist of multiple images.""" + errors = 0 + for image in strip.getImages(): + try: + image.save(basepath, progress) + except IOError, msg: + out.write('Error saving %s: %s' % (image.filename, msg)) + errors += 1 + return errors - def saveComic(self, comic): - """Save one comic strip in an output file.""" - basepath = self.settings['basepath'] - progress = self.settings.get('progress', False) - fn, saved = comic.save(basepath, progress) - return saved - def saveComics(self, comics): - """Save a list of comics.""" - saved = False +def displayHelp(comics, basepath): + """Print help for comic strips.""" + for scraperobj in getScrapers(comics, basepath): + for line in scraperobj.getHelp().splitlines(): + out.write("Help: "+line) + return 0 + +def getComics(options, comics): + errors = 0 + events.installHandler(options.output, options.basepath, options.baseurl) + events.handler.start() + for scraperobj in getScrapers(comics, options.basepath): + out.context = scraperobj.get_name() + if options.catchup: + out.write('Catching up...') + strips = scraperobj.getAllStrips() + else: + out.write('Retrieving the current strip...') + strips = [scraperobj.getCurrentStrip()] + for strip in strips: + errors += saveComicStrip(strip, options.basepath, options.progress) + events.handler.end() + return errors + + +def run(options, comics): + """Execute comic commands.""" + setOutputInfo(options) + if options.version: + return displayVersion() + if options.list: + return doList(options.list == 1) + if len(comics) <= 0: + out.write('Warning: No comics specified, bailing out!') + return 1 + if options.modhelp: + return displayHelp(comics, options.basepath) + errors = getComics(options, comics) + + +def doList(columnList): + """List available comics.""" + out.write('Available comic scrapers:') + scrapers = getScrapers(['@@']) + if columnList: + doColumnList(scrapers) + else: + doSingleList(scrapers) + out.write('%d supported comics.' % len(scrapers)) + return 0 + + +def doSingleList(scrapers): + """Get list of scraper names, one per line.""" + print '\n'.join(scraperobj.get_name() for scraperobj in scrapers) + + +def doColumnList(scrapers): + """Get list of scraper names with multiple names per line.""" + screenWidth = get_columns() + names = [scraperobj.get_name() for scraperobj in scrapers] + maxlen = max([len(name) for name in names]) + namesPerLine = int(screenWidth / (maxlen + 1)) + while names: + print ''.join([name.ljust(maxlen) for name in names[:namesPerLine]]) + del names[:namesPerLine] + + +def getScrapers(comics, basepath=None): + """Get scraper objects for the given comics.""" + if '@' in comics: + # only scrapers whose directory already exists + if len(comics) > 1: + out.write("WARN: using '@' as comic name ignores all other specified comics.\n") + for scraperclass in scraper.get_scrapers(): + dirname = scraperclass.get_name().replace('/', os.sep) + if os.path.isdir(os.path.join(basepath, dirname)): + yield scraperclass() + elif '@@' in comics: + # all scrapers + if len(comics) > 1: + out.write("WARN: using '@@' as comic name ignores all other specified comics.\n") + for scraperclass in scraper.get_scrapers(): + yield scraperclass() + else: + # only selected for comic in comics: - saved = self.saveComic(comic) or saved - return saved - - def safeOp(self, fp, *args, **kwargs): - """Run a function and catch and report any errors.""" - try: - fp(*args, **kwargs) - except Exception: - self.errors += 1 - type, value, tb = sys.exc_info() - out.write('Traceback (most recent call last):', 1) - out.writelines(traceback.format_stack(), 1) - out.writelines(traceback.format_tb(tb)[1:], 1) - out.writelines(traceback.format_exception_only(type, value)) - - def getCurrent(self): - """Retrieve and save all current comic strips.""" - out.write('Retrieving the current strip...') - self.saveComics(self.module.getCurrentComics()) - - def getIndex(self, index): - """Retrieve comcis with given index.""" - out.write('Retrieving index "%s"....' % (index,)) - try: - self.module.setStrip(index) - self.saveComics(self.module.getNextComics()) - except NotImplementedError: - out.write('No indexed retrieval support.') - - def catchup(self): - """Save all comics until the current date.""" - out.write('Catching up...') - for comics in self.module: - if not self.saveComics(comics) and self.settings['catchup'] < 2: - break - - def catchupIndex(self, index): - """Retrieve and save all comics from the given index.""" - out.write('Catching up from index "%s"...' % (index,)) - self.module.setStrip(index) - for comics in self.module: - if not self.saveComics(comics) and self.settings['catchup'] < 2: - break - - def getScrapers(self): - """Get list of scraper objects.""" - return scraper.items() - - def getExistingComics(self): - """Get all existing comic scrapers.""" - for scraper in self.getScrapers(): - dirname = scraper.get_name().replace('/', os.sep) - if os.path.isdir(os.path.join(self.settings['basepath'], dirname)): - yield scraper - - def doList(self, columnList): - """List available comics.""" - out.write('Available comic scrapers:') - scrapers = self.getScrapers() - if len(scrapers) > 0: - if columnList: - self.doColumnList(scrapers) - else: - self.doSingleList(scrapers) - out.write('%d supported comics.' % len(scrapers)) - - def doSingleList(self, scrapers): - """Get list of scraper names, one per line.""" - print '\n'.join(scraper.get_name() for scraper in scrapers) - - def doColumnList(self, scrapers): - """Get list of scraper names with multiple names per line.""" - screenWidth = get_columns() - names = [scraper.get_name() for scraper in scrapers] - maxlen = max([len(name) for name in names]) - namesPerLine = int(screenWidth / (maxlen + 1)) - while names: - print ''.join([name.ljust(maxlen) for name in names[:namesPerLine]]) - del names[:namesPerLine] - - def doCatchup(self): - """Catchup comics.""" - for comic in self.useComics(): - if self.indices: - self.safeOp(self.catchupIndex, self.indices[0]) - else: - self.safeOp(self.catchup) - - def doCurrent(self): - """Get current comics.""" - for comic in self.useComics(): - if self.indices: - for index in self.indices: - self.safeOp(self.getIndex, index) - else: - self.safeOp(self.getCurrent) - - def doHelp(self): - """Print help for comic strips.""" - for scraper in self.useComics(): - for line in scraper.getHelp().splitlines(): - out.write("Help: "+line) - - def setupComic(self, scraper): - """Setup the internal comic module from given scraper.""" - self.module = scraper() - out.context = scraper.get_name() - return self.module - - def useComics(self): - """Set all comic modules for the defined comics.""" - for comic in self.comics: c = comic.split(':', 2) if len(c) > 1: - self.indices = c[1].split(',') + indices = c[1].split(',') else: - self.indices = None - + indices = None moduleName = c[0] - if moduleName == '@': - for s in self.getExistingComics(): - yield self.setupComic(s) - elif moduleName == '@@': - for s in self.getScrapers(): - yield self.setupComic(s) - else: - yield self.setupComic(scraper.get(moduleName)) - - def run(self, comics): - """Execute comic commands.""" - self.setOutputInfo() - self.comics = comics - - om = self.settings['output'] - events.installHandler(om, self.settings['basepath'], self.settings['baseurl']) - events.handler.start() - - if self.settings['version']: - displayVersion() - elif self.settings['list']: - self.doList(self.settings['list'] == 1) - elif len(comics) <= 0: - out.write('Warning: No comics specified, bailing out!') - elif self.settings['modhelp']: - self.doHelp() - elif self.settings['catchup']: - self.doCatchup() - else: - self.doCurrent() - - events.handler.end() + yield scraper.get_scraper(moduleName)(indices=indices) def main(): """Parse options and execute commands.""" try: parser = setupOptions() options, args = parser.parse_args() - d = Dosage(options.__dict__) - d.run(args) - if d.errors: - res = 1 - else: - res = 0 + res = run(options, args) except KeyboardInterrupt: print "Aborted." res = 1 diff --git a/dosagelib/comic.py b/dosagelib/comic.py index d289e73b7..fe4063912 100644 --- a/dosagelib/comic.py +++ b/dosagelib/comic.py @@ -6,8 +6,6 @@ import locale import rfc822 import time import shutil -# XXX why is this done?? -locale.setlocale(locale.LC_ALL, '') from .output import out from .util import urlopen, saneDataSize, normaliseURL @@ -18,16 +16,34 @@ class FetchComicError(IOError): """Exception for comic fetching errors.""" pass -class Comic(object): - """Download and save a single comic.""" +class ComicStrip(object): + """A list of comic image URLs.""" - def __init__(self, moduleName, url, referrer=None, filename=None): + def __init__(self, name, parentUrl, imageUrls, namer): + """Store the image URL list.""" + self.name = name + self.parentUrl = parentUrl + self.imageUrls = imageUrls + self.namer = namer + + def getImages(self): + """Get a list of image downloaders.""" + for imageUrl in self.imageUrls: + yield self.getDownloader(normaliseURL(imageUrl)) + + def getDownloader(self, url): + filename = self.namer(url, self.parentUrl) + return ComicImage(self.name, self.parentUrl, url, filename) + + +class ComicImage(object): + def __init__(self, name, referrer, url, filename): """Set URL and filename.""" - self.moduleName = moduleName - self.url = normaliseURL(url) + self.name = name self.referrer = referrer + self.url = url if filename is None: - filename = url.split('/')[-1] + filename = url.rsplit('/')[1] self.filename, self.ext = os.path.splitext(filename) self.filename = self.filename.replace(os.sep, '_') self.ext = self.ext.replace(os.sep, '_') @@ -62,13 +78,13 @@ class Comic(object): def save(self, basepath, showProgress=False): """Save comic URL to filename on disk.""" self.connect() - comicName, comicExt = self.filename, self.ext + filename = "%s%s" % (self.filename, self.ext) comicSize = self.contentLength - comicDir = os.path.join(basepath, self.moduleName.replace('/', os.sep)) + comicDir = os.path.join(basepath, self.name.replace('/', os.sep)) if not os.path.isdir(comicDir): os.makedirs(comicDir) - fn = os.path.join(comicDir, '%s%s' % (self.filename, self.ext)) + fn = os.path.join(comicDir, filename) if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize: self.urlobj.close() self.touch(fn) @@ -76,10 +92,8 @@ class Comic(object): return fn, False try: - tmpFn = os.path.join(comicDir, '__%s%s' % (self.filename, self.ext)) - out.write('Writing comic to temporary file %s...' % (tmpFn,), 3) - comicOut = file(tmpFn, 'wb') - try: + out.write('Writing comic to file %s...' % (fn,), 3) + with open(fn, 'wb') as comicOut: startTime = time.time() if showProgress: def pollData(): @@ -92,12 +106,12 @@ class Comic(object): else: comicOut.write(self.urlobj.read()) endTime = time.time() - finally: - comicOut.close() - out.write('Copying temporary file (%s) to %s...' % (tmpFn, fn), 3) - shutil.copy2(tmpFn, fn) self.touch(fn) - + except: + if os.path.isfile(fn): + os.remove(fn) + raise + else: size = os.path.getsize(fn) bytes = locale.format('%d', size, True) if endTime != startTime: @@ -106,13 +120,8 @@ class Comic(object): speed = '???' attrs = dict(fn=fn, bytes=bytes, speed=speed) out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1) - handler.comicDownloaded(self.moduleName, fn) - self.urlobj.close() + handler.comicDownloaded(self.name, fn) finally: - try: - out.write('Removing temporary file %s...' % (tmpFn,), 3) - os.remove(tmpFn) - except: - pass + self.urlobj.close() return fn, True diff --git a/dosagelib/configuration.py b/dosagelib/configuration.py index ac8815dab..24325dd09 100644 --- a/dosagelib/configuration.py +++ b/dosagelib/configuration.py @@ -9,11 +9,13 @@ AppName = configdata.name App = AppName+u" "+Version Author = configdata.author HtmlAuthor = Author.replace(u' ', u' ') -Copyright = u"Copyright (C) 2004-2008 "+Author -HtmlCopyright = u"Copyright © 2004-2008 "+HtmlAuthor +Maintainer = configdata.maintainer +HtmlMaintainer = Maintainer.replace(u' ', u' ') +Copyright = u"Copyright (C) 2004-2008 "+Author+u", (C) 2012 "+Maintainer +HtmlCopyright = u"Copyright © 2004-2008 "+HtmlAuthor+u", 2012 "+HtmlMaintainer Url = configdata.url SupportUrl = Url + u"/issues" -Email = configdata.author_email +Email = configdata.maintainer_email UserAgent = u"Mozilla/5.0 (compatible; %s/%s; +%s)" % (AppName, Version, Url) Freeware = AppName+u""" comes with ABSOLUTELY NO WARRANTY! This is free software, and you are welcome to redistribute it diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py index eb4be5dab..995546dc3 100644 --- a/dosagelib/helpers.py +++ b/dosagelib/helpers.py @@ -2,112 +2,10 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012 Bastian Kleineidam import re +import urlparse -from .util import fetchUrl, fetchManyUrls, getQueryParams -from .comic import Comic - -class _BasicScraper(object): - '''Base class with scrape functions for comics. - - @type latestUrl: C{string} - @cvar latestUrl: The URL for the latest comic strip. - @type imageUrl: C{string} - @cvar imageUrl: A string that is interpolated with the strip index - to yield the URL for a particular strip. - @type imageSearch: C{regex} - @cvar imageSearch: A compiled regex that will locate the strip image URL - when applied to the strip page. - @type prevSearch: C{regex} - @cvar prevSearch: A compiled regex that will locate the URL for the - previous strip when applied to a strip page. - ''' - referrer = None - help = 'Sorry, no help for this comic yet.' - - def __init__(self): - """Initialize internal variables.""" - self.currentUrl = None - self.urls = set() - - def getReferrer(self, imageUrl, pageUrl): - """Return referrer for HTTP connection.""" - return self.referrer or pageUrl or self.getLatestUrl() - - def getComic(self, url, pageUrl): - """Get comic downloader for given URL and page.""" - if not url: - return None - return Comic(self.get_name(), url, filename=self.getFilename(url, pageUrl), referrer=self.getReferrer(url, pageUrl)) - - def getCurrentComics(self): - """Get list of current comics.""" - self.currentUrl = self.getLatestUrl() - comics = self.getNextComics() - if not comics: - raise ValueError("Could not find current comic.") - return comics - - def getNextComics(self): - """Get all next comics.""" - comics = [] - while not comics and self.currentUrl and self.currentUrl not in self.urls: - comicUrlGroups, prevUrl = fetchManyUrls(self.currentUrl, [self.imageSearch, self.prevSearch]) - - if prevUrl: - prevUrl = prevUrl[0] - else: - prevUrl = None - - for comicUrl in comicUrlGroups: - comics.append(self.getComic(comicUrl, self.currentUrl)) - - self.urls.update([self.currentUrl]) - self.currentUrl = (prevUrl, None)[prevUrl in self.urls] - return comics - - def setStrip(self, index): - """Set current comic strip URL.""" - self.currentUrl = self.imageUrl % index - - def getHelp(self): - """Return help text for this scraper.""" - return self.help - - def __iter__(self): - """Iterate through the strips, starting from the current one and going backward.""" - if not self.currentUrl: - self.currentUrl = self.getLatestUrl() - comics = True - while comics: - comics = self.getNextComics() - if comics: - yield comics - - @classmethod - def get_name(cls): - """Get scraper name.""" - if hasattr(cls, 'name'): - return cls.name - return cls.__name__ - - @classmethod - def starter(cls): - """Get starter URL from where to scrape comic strips.""" - return cls.latestUrl - - @classmethod - def namer(cls, imageUrl, pageUrl): - """Return filename for given image and page URL.""" - return None - - def getFilename(self, imageUrl, pageUrl): - """Return filename for given image and page URL.""" - return self.namer(imageUrl, pageUrl) - - def getLatestUrl(self): - """Get starter URL from where to scrape comic strips.""" - return self.starter() - +from .util import fetchUrl, getQueryParams +from .scraper import _BasicScraper def queryNamer(paramName, usePageUrl=False): """Get name from URL query part.""" diff --git a/dosagelib/loader.py b/dosagelib/loader.py new file mode 100644 index 000000000..29936d624 --- /dev/null +++ b/dosagelib/loader.py @@ -0,0 +1,74 @@ +# -*- coding: iso-8859-1 -*- + +import os +import sys + +def get_modules(folder, importprefix): + """Find all valid modules in the plugins directory. A valid module + must have a .py extension, and is importable. + @return: all loaded valid modules + @rtype: iterator of module + """ + for filename in get_importable_modules(folder): + try: + module = load_module(filename, importprefix) + if module is not None: + yield module + except StandardError, msg: + print "ERROR: could not load module %s: %s" % (filename, msg) + + +def get_importable_modules(folder): + """Find all module files in the given folder that end witn '.py' and + don't start with an underscore. + @return module filenames + @rtype: iterator of string + """ + for fname in os.listdir(folder): + if fname.endswith('.py') and not fname.startswith('_'): + yield os.path.join(folder, fname) + + +def load_module(filename, importprefix): + """Load and return the module given by the filename. + Other exceptions than ImportError are not catched. + @return: loaded module or None on import errors + @rtype: module or None + """ + name = os.path.splitext(os.path.basename(filename))[0] + modulename = "%s%s" % (importprefix, name) + __import__(modulename) + return sys.modules[modulename] + + +def get_plugins(modules, classobj): + """Find all scrapers in all modules. + @param modules: the modules to search + @ptype modules: iterator of modules + @return: found scrapers + @rytpe: iterator of class objects + """ + for module in modules: + for plugin in get_module_plugins(module, classobj): + yield plugin + + +def get_module_plugins(module, classobj): + """Return all subclasses of _BasicScraper in the module. + If the module defines __all__, only those entries will be searched, + otherwise all objects not starting with '_' will be searched. + """ + try: + names = module.__all__ + except AttributeError: + names = [x for x in vars(module) if not x.startswith('_')] + for name in names: + try: + obj = getattr(module, name) + except AttributeError: + continue + try: + if issubclass(obj, classobj): + yield obj + except TypeError: + continue diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py index 824eb086d..ef4f45999 100644 --- a/dosagelib/plugins/a.py +++ b/dosagelib/plugins/a.py @@ -2,14 +2,14 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, MULTILINE from ..util import tagre - -from ..helpers import _BasicScraper, regexNamer, bounceStarter, indirectStarter +from ..scraper import _BasicScraper +from ..helpers import regexNamer, bounceStarter, indirectStarter class ALessonIsLearned(_BasicScraper): latestUrl = 'http://www.alessonislearned.com/' imageUrl = 'http://www.alessonislearned.com/lesson%s.html' - imageSearch = compile(tagre("img", "src", r"(cmx/.+?)")) + imageSearch = compile(tagre("img", "src", r"(cmx/lesson.+?)")) prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=.+?)")+r".+?previous") help = 'Index format: nnn' diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py index 0b9486bc6..04f75a28c 100644 --- a/dosagelib/plugins/b.py +++ b/dosagelib/plugins/b.py @@ -2,7 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile -from ..helpers import _BasicScraper +from ..scraper import _BasicScraper class BadlyDrawnKitties(_BasicScraper): diff --git a/dosagelib/plugins/c.py b/dosagelib/plugins/c.py index ed82c6749..625362bef 100644 --- a/dosagelib/plugins/c.py +++ b/dosagelib/plugins/c.py @@ -2,18 +2,23 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile -from ..helpers import ( - _BasicScraper, constStarter, bounceStarter, indirectStarter) -from ..util import getQueryParams +from ..scraper import _BasicScraper +from ..helpers import constStarter, bounceStarter, indirectStarter +from ..util import tagre, getQueryParams class CalvinAndHobbes(_BasicScraper): - latestUrl = 'http://www.gocomics.com/calvinandhobbes/' + starter = bounceStarter('http://www.gocomics.com/calvinandhobbes/', + compile(tagre("a", "href", "(/calvinandhobbes/\d+/\d+/\d+)")+"Next feature")) imageUrl = 'http://www.gocomics.com/calvinandhobbes/%s' - imageSearch = compile(r'src="(http://picayune\.uclick\.com/comics/ch/[^"]+\.gif)"') - prevSearch = compile(r'href="(.*?)"\s+onclick="[^"]*">Previous day') + imageSearch = compile(tagre("img", "src", "(http://assets\.amuniversal\.com/[a-f0-9]+)")) + prevSearch = compile(tagre("a", "href", "(/calvinandhobbes/\d+/\d+/\d+)")+"Previous feature") help = 'Index format: yyyy/mm/dd' + @classmethod + def namer(cls, imageUrl, pageUrl): + prefix, year, month, day = pageUrl.rsplit('/', 3) + return "%s%s%s.gif" % (year, month, day) class CandyCartoon(_BasicScraper): diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py index ddadcab2b..9f3ec72c4 100644 --- a/dosagelib/plugins/d.py +++ b/dosagelib/plugins/d.py @@ -2,7 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE, MULTILINE -from ..helpers import _BasicScraper, bounceStarter, indirectStarter +from ..scraper import _BasicScraper +from ..helpers import bounceStarter, indirectStarter from ..util import getQueryParams diff --git a/dosagelib/plugins/drunkduck.py b/dosagelib/plugins/drunkduck.py index 402c08d32..ea1b7e080 100644 --- a/dosagelib/plugins/drunkduck.py +++ b/dosagelib/plugins/drunkduck.py @@ -2,7 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper, bounceStarter, queryNamer +from ..scraper import _BasicScraper +from ..helpers import bounceStarter, queryNamer def drunkDuck(shortName): diff --git a/dosagelib/plugins/e.py b/dosagelib/plugins/e.py index bc7eea357..3597b8148 100644 --- a/dosagelib/plugins/e.py +++ b/dosagelib/plugins/e.py @@ -2,7 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper, indirectStarter +from ..helpers import indirectStarter +from ..scraper import _BasicScraper class EerieCuties(_BasicScraper): diff --git a/dosagelib/plugins/f.py b/dosagelib/plugins/f.py index f66d3b1fb..9f7cf6e2f 100644 --- a/dosagelib/plugins/f.py +++ b/dosagelib/plugins/f.py @@ -1,9 +1,10 @@ # -*- coding: iso-8859-1 -*- # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE, MULTILINE -from ..util import tagre -from ..helpers import _BasicScraper, indirectStarter +from ..util import tagre +from ..scraper import _BasicScraper +from ..helpers import indirectStarter class FalconTwin(_BasicScraper): diff --git a/dosagelib/plugins/g.py b/dosagelib/plugins/g.py index c5b2b140c..fcbc13f9f 100644 --- a/dosagelib/plugins/g.py +++ b/dosagelib/plugins/g.py @@ -2,7 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile -from ..helpers import _BasicScraper, indirectStarter +from ..scraper import _BasicScraper +from ..helpers import indirectStarter class Galaxion(_BasicScraper): diff --git a/dosagelib/plugins/h.py b/dosagelib/plugins/h.py index 3e34fd0ea..228ca85ea 100644 --- a/dosagelib/plugins/h.py +++ b/dosagelib/plugins/h.py @@ -2,7 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile -from ..helpers import _BasicScraper +from ..scraper import _BasicScraper class HappyMedium(_BasicScraper): diff --git a/dosagelib/plugins/i.py b/dosagelib/plugins/i.py index b5ed19056..3ac32c160 100644 --- a/dosagelib/plugins/i.py +++ b/dosagelib/plugins/i.py @@ -2,7 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper +from ..scraper import _BasicScraper class IDreamOfAJeanieBottle(_BasicScraper): diff --git a/dosagelib/plugins/j.py b/dosagelib/plugins/j.py index adbe635dd..b241e6fca 100644 --- a/dosagelib/plugins/j.py +++ b/dosagelib/plugins/j.py @@ -2,8 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, MULTILINE -from ..helpers import _BasicScraper - +from ..scraper import _BasicScraper class Jack(_BasicScraper): diff --git a/dosagelib/plugins/k.py b/dosagelib/plugins/k.py index 66027e024..9572532df 100644 --- a/dosagelib/plugins/k.py +++ b/dosagelib/plugins/k.py @@ -2,8 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper - +from ..scraper import _BasicScraper class KernelPanic(_BasicScraper): diff --git a/dosagelib/plugins/keenspot.py b/dosagelib/plugins/keenspot.py index bbf5aa697..6f1ccee53 100644 --- a/dosagelib/plugins/keenspot.py +++ b/dosagelib/plugins/keenspot.py @@ -2,7 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper +from ..scraper import _BasicScraper def keenSpot(comics): diff --git a/dosagelib/plugins/l.py b/dosagelib/plugins/l.py index f662b38d6..a91a14aa2 100644 --- a/dosagelib/plugins/l.py +++ b/dosagelib/plugins/l.py @@ -2,8 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile -from ..helpers import _BasicScraper, indirectStarter - +from ..scraper import _BasicScraper +from ..helpers import indirectStarter class LasLindas(_BasicScraper): diff --git a/dosagelib/plugins/m.py b/dosagelib/plugins/m.py index 5849db562..03e80a794 100644 --- a/dosagelib/plugins/m.py +++ b/dosagelib/plugins/m.py @@ -2,7 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper, queryNamer +from ..scraper import _BasicScraper +from ..helpers import queryNamer class MadamAndEve(_BasicScraper): diff --git a/dosagelib/plugins/n.py b/dosagelib/plugins/n.py index 66ac9b95f..78c2e9a75 100644 --- a/dosagelib/plugins/n.py +++ b/dosagelib/plugins/n.py @@ -2,7 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper, indirectStarter, _PHPScraper +from ..scraper import _BasicScraper +from ..helpers import indirectStarter, _PHPScraper diff --git a/dosagelib/plugins/num.py b/dosagelib/plugins/num.py index b20c4f9d4..77ef10d8b 100644 --- a/dosagelib/plugins/num.py +++ b/dosagelib/plugins/num.py @@ -2,8 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile -from ..helpers import _BasicScraper - +from ..scraper import _BasicScraper class NineteenNinetySeven(_BasicScraper): diff --git a/dosagelib/plugins/o.py b/dosagelib/plugins/o.py index 726086bdb..56d8f261a 100644 --- a/dosagelib/plugins/o.py +++ b/dosagelib/plugins/o.py @@ -2,7 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper, indirectStarter +from ..scraper import _BasicScraper +from ..helpers import indirectStarter class OctopusPie(_BasicScraper): diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py index 496d2926c..73253af9d 100644 --- a/dosagelib/plugins/p.py +++ b/dosagelib/plugins/p.py @@ -2,7 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper, bounceStarter, queryNamer +from ..scraper import _BasicScraper +from ..helpers import bounceStarter, queryNamer class PartiallyClips(_BasicScraper): diff --git a/dosagelib/plugins/q.py b/dosagelib/plugins/q.py index c721317b7..02739a82f 100644 --- a/dosagelib/plugins/q.py +++ b/dosagelib/plugins/q.py @@ -2,8 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile -from ..helpers import _BasicScraper - +from ..scraper import _BasicScraper class QuestionableContent(_BasicScraper): diff --git a/dosagelib/plugins/r.py b/dosagelib/plugins/r.py index 58016469f..c88466335 100644 --- a/dosagelib/plugins/r.py +++ b/dosagelib/plugins/r.py @@ -2,8 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile -from ..helpers import _BasicScraper, bounceStarter - +from ..scraper import _BasicScraper +from ..helpers import bounceStarter class RadioactivePanda(_BasicScraper): diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py index eb56bb114..50b795279 100644 --- a/dosagelib/plugins/s.py +++ b/dosagelib/plugins/s.py @@ -3,7 +3,8 @@ from re import compile, MULTILINE, IGNORECASE, sub from os.path import splitext -from ..helpers import _BasicScraper, bounceStarter, indirectStarter +from ..scraper import _BasicScraper +from ..helpers import bounceStarter, indirectStarter class SailorsunOrg(_BasicScraper): diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py index c9b8559d4..7e93fbf8f 100644 --- a/dosagelib/plugins/t.py +++ b/dosagelib/plugins/t.py @@ -2,8 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper, indirectStarter - +from ..scraper import _BasicScraper +from ..helpers import indirectStarter class TalesOfPylea(_BasicScraper): diff --git a/dosagelib/plugins/u.py b/dosagelib/plugins/u.py index 164104d38..d3c115a71 100644 --- a/dosagelib/plugins/u.py +++ b/dosagelib/plugins/u.py @@ -2,7 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE -from ..helpers import _BasicScraper, bounceStarter, indirectStarter +from ..scraper import _BasicScraper +from ..helpers import bounceStarter, indirectStarter from ..util import getQueryParams diff --git a/dosagelib/plugins/uc.py b/dosagelib/plugins/uc.py index 78e88e350..72a245557 100644 --- a/dosagelib/plugins/uc.py +++ b/dosagelib/plugins/uc.py @@ -2,8 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE, sub -from ..helpers import _BasicScraper -from ..util import fetchManyMatches, fetchUrl +from ..scraper import _BasicScraper +from ..util import fetchUrl class _UClickScraper(_BasicScraper): @@ -24,6 +24,7 @@ class _UClickScraper(_BasicScraper): 'index', ) + # XXX refactor this mess submoduleSearch = compile(r'([^>]+?)', IGNORECASE) partsMatch = compile(r'([^>]+?)', IGNORECASE) matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0] diff --git a/dosagelib/plugins/v.py b/dosagelib/plugins/v.py index eae02598b..109db8041 100644 --- a/dosagelib/plugins/v.py +++ b/dosagelib/plugins/v.py @@ -2,8 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE, MULTILINE -from ..helpers import _BasicScraper - +from ..scraper import _BasicScraper class _VGCats(_BasicScraper): diff --git a/dosagelib/plugins/w.py b/dosagelib/plugins/w.py index 6ac858607..0aaedd9af 100644 --- a/dosagelib/plugins/w.py +++ b/dosagelib/plugins/w.py @@ -2,7 +2,8 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile, IGNORECASE, DOTALL -from ..helpers import _BasicScraper, queryNamer, bounceStarter +from ..scraper import _BasicScraper +from ..helpers import queryNamer, bounceStarter class WayfarersMoon(_BasicScraper): diff --git a/dosagelib/plugins/x.py b/dosagelib/plugins/x.py index d9861822c..6ba59215f 100644 --- a/dosagelib/plugins/x.py +++ b/dosagelib/plugins/x.py @@ -2,7 +2,9 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs from re import compile -from ..helpers import _BasicScraper, bounceStarter +from ..scraper import _BasicScraper +from ..helpers import bounceStarter + class xkcd(_BasicScraper): starter = bounceStarter('http://xkcd.com/', compile(r'