diff --git a/dosage b/dosage
index 046b6fbad..cce4fa895 100755
--- a/dosage
+++ b/dosage
@@ -20,7 +20,6 @@
import sys
import os
import optparse
-import traceback
from dosagelib import events, scraper
from dosagelib.output import out
@@ -35,13 +34,13 @@ def setupOptions():
usage = 'usage: %prog [options] comicModule [comicModule ...]'
parser = optparse.OptionParser(usage=usage)
parser.add_option('-v', '--verbose', action='count', dest='verbose', default=0, help='provides verbose output, use multiple times for more verbosity')
- parser.add_option('-c', '--catch-up', action='count', dest='catchup', default=None, help='traverse and retrieve all available comics up until the strip that already exists locally, use twice to retrieve until all strips exist locally')
- parser.add_option('-b', '--base-path', action='store', dest='basepath', default='Comics', help='set the path to create invidivual comic directories in, default is Comics', metavar='PATH')
- parser.add_option('--base-url', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --base-path', metavar='PATH')
+ parser.add_option('-c', '--catchup', action='count', dest='catchup', default=None, help='traverse and retrieve all available comics up until the strip that already exists locally, use twice to retrieve until all strips exist locally')
+    parser.add_option('-b', '--basepath', action='store', dest='basepath', default='Comics', help='set the path to create individual comic directories in, default is Comics', metavar='PATH')
+    parser.add_option('--baseurl', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --basepath', metavar='PATH')
parser.add_option('-l', '--list', action='store_const', const=1, dest='list', help='list available comic modules')
- parser.add_option('--single-list', action='store_const', const=2, dest='list', help='list available comic modules in a single list')
+ parser.add_option('--singlelist', action='store_const', const=2, dest='list', help='list available comic modules in a single list')
parser.add_option('-V', '--version', action='store_true', dest='version', help='display the version number')
- parser.add_option('-m', '--module-help', action='store_true', dest='modhelp', help='display help for comic modules')
+ parser.add_option('-m', '--modulehelp', action='store_true', dest='modhelp', help='display help for comic modules')
parser.add_option('-t', '--timestamps', action='store_true', dest='timestamps', default=False, help='print timestamps for all output at any info level')
parser.add_option('-o', '--output', action='store', dest='output', choices=events.getHandlers(), help='output formatting for downloaded comics')
if is_tty(sys.stdout):
@@ -54,196 +53,129 @@ def displayVersion():
print App
print Copyright
print Freeware
+ return 0
-class Dosage(object):
- """Main program executing comic commands."""
+def setOutputInfo(options):
+ """Set global output level and timestamp option."""
+ out.level = 0
+ out.level += options.verbose
+ out.timestamps = options.timestamps
- def __init__(self, settings):
- """Store settings and initialize internal variables."""
- self.settings = settings
- self.errors = 0
- def setOutputInfo(self):
- """Set global output level and timestamp option."""
- out.level = 0
- out.level += self.settings['verbose']
- out.timestamps = self.settings['timestamps']
+def saveComicStrip(strip, basepath, progress):
+ """Save a comic strip which can consist of multiple images."""
+ errors = 0
+ for image in strip.getImages():
+ try:
+ image.save(basepath, progress)
+ except IOError, msg:
+ out.write('Error saving %s: %s' % (image.filename, msg))
+ errors += 1
+ return errors
- def saveComic(self, comic):
- """Save one comic strip in an output file."""
- basepath = self.settings['basepath']
- progress = self.settings.get('progress', False)
- fn, saved = comic.save(basepath, progress)
- return saved
- def saveComics(self, comics):
- """Save a list of comics."""
- saved = False
+def displayHelp(comics, basepath):
+ """Print help for comic strips."""
+ for scraperobj in getScrapers(comics, basepath):
+ for line in scraperobj.getHelp().splitlines():
+ out.write("Help: "+line)
+ return 0
+
+def getComics(options, comics):
+ errors = 0
+ events.installHandler(options.output, options.basepath, options.baseurl)
+ events.handler.start()
+ for scraperobj in getScrapers(comics, options.basepath):
+ out.context = scraperobj.get_name()
+ if options.catchup:
+ out.write('Catching up...')
+ strips = scraperobj.getAllStrips()
+ else:
+ out.write('Retrieving the current strip...')
+ strips = [scraperobj.getCurrentStrip()]
+ for strip in strips:
+            errors += saveComicStrip(strip, options.basepath, getattr(options, 'progress', False))
+ events.handler.end()
+ return errors
+
+
+def run(options, comics):
+ """Execute comic commands."""
+ setOutputInfo(options)
+ if options.version:
+ return displayVersion()
+ if options.list:
+ return doList(options.list == 1)
+ if len(comics) <= 0:
+ out.write('Warning: No comics specified, bailing out!')
+ return 1
+ if options.modhelp:
+ return displayHelp(comics, options.basepath)
+    return getComics(options, comics)
+
+
+def doList(columnList):
+ """List available comics."""
+ out.write('Available comic scrapers:')
+    scrapers = list(getScrapers(['@@']))
+ if columnList:
+ doColumnList(scrapers)
+ else:
+ doSingleList(scrapers)
+ out.write('%d supported comics.' % len(scrapers))
+ return 0
+
+
+def doSingleList(scrapers):
+ """Get list of scraper names, one per line."""
+ print '\n'.join(scraperobj.get_name() for scraperobj in scrapers)
+
+
+def doColumnList(scrapers):
+ """Get list of scraper names with multiple names per line."""
+ screenWidth = get_columns()
+ names = [scraperobj.get_name() for scraperobj in scrapers]
+ maxlen = max([len(name) for name in names])
+ namesPerLine = int(screenWidth / (maxlen + 1))
+ while names:
+ print ''.join([name.ljust(maxlen) for name in names[:namesPerLine]])
+ del names[:namesPerLine]
+
+
+def getScrapers(comics, basepath=None):
+ """Get scraper objects for the given comics."""
+ if '@' in comics:
+ # only scrapers whose directory already exists
+ if len(comics) > 1:
+ out.write("WARN: using '@' as comic name ignores all other specified comics.\n")
+ for scraperclass in scraper.get_scrapers():
+ dirname = scraperclass.get_name().replace('/', os.sep)
+ if os.path.isdir(os.path.join(basepath, dirname)):
+ yield scraperclass()
+ elif '@@' in comics:
+ # all scrapers
+ if len(comics) > 1:
+ out.write("WARN: using '@@' as comic name ignores all other specified comics.\n")
+ for scraperclass in scraper.get_scrapers():
+ yield scraperclass()
+ else:
+ # only selected
for comic in comics:
- saved = self.saveComic(comic) or saved
- return saved
-
- def safeOp(self, fp, *args, **kwargs):
- """Run a function and catch and report any errors."""
- try:
- fp(*args, **kwargs)
- except Exception:
- self.errors += 1
- type, value, tb = sys.exc_info()
- out.write('Traceback (most recent call last):', 1)
- out.writelines(traceback.format_stack(), 1)
- out.writelines(traceback.format_tb(tb)[1:], 1)
- out.writelines(traceback.format_exception_only(type, value))
-
- def getCurrent(self):
- """Retrieve and save all current comic strips."""
- out.write('Retrieving the current strip...')
- self.saveComics(self.module.getCurrentComics())
-
- def getIndex(self, index):
- """Retrieve comcis with given index."""
- out.write('Retrieving index "%s"....' % (index,))
- try:
- self.module.setStrip(index)
- self.saveComics(self.module.getNextComics())
- except NotImplementedError:
- out.write('No indexed retrieval support.')
-
- def catchup(self):
- """Save all comics until the current date."""
- out.write('Catching up...')
- for comics in self.module:
- if not self.saveComics(comics) and self.settings['catchup'] < 2:
- break
-
- def catchupIndex(self, index):
- """Retrieve and save all comics from the given index."""
- out.write('Catching up from index "%s"...' % (index,))
- self.module.setStrip(index)
- for comics in self.module:
- if not self.saveComics(comics) and self.settings['catchup'] < 2:
- break
-
- def getScrapers(self):
- """Get list of scraper objects."""
- return scraper.items()
-
- def getExistingComics(self):
- """Get all existing comic scrapers."""
- for scraper in self.getScrapers():
- dirname = scraper.get_name().replace('/', os.sep)
- if os.path.isdir(os.path.join(self.settings['basepath'], dirname)):
- yield scraper
-
- def doList(self, columnList):
- """List available comics."""
- out.write('Available comic scrapers:')
- scrapers = self.getScrapers()
- if len(scrapers) > 0:
- if columnList:
- self.doColumnList(scrapers)
- else:
- self.doSingleList(scrapers)
- out.write('%d supported comics.' % len(scrapers))
-
- def doSingleList(self, scrapers):
- """Get list of scraper names, one per line."""
- print '\n'.join(scraper.get_name() for scraper in scrapers)
-
- def doColumnList(self, scrapers):
- """Get list of scraper names with multiple names per line."""
- screenWidth = get_columns()
- names = [scraper.get_name() for scraper in scrapers]
- maxlen = max([len(name) for name in names])
- namesPerLine = int(screenWidth / (maxlen + 1))
- while names:
- print ''.join([name.ljust(maxlen) for name in names[:namesPerLine]])
- del names[:namesPerLine]
-
- def doCatchup(self):
- """Catchup comics."""
- for comic in self.useComics():
- if self.indices:
- self.safeOp(self.catchupIndex, self.indices[0])
- else:
- self.safeOp(self.catchup)
-
- def doCurrent(self):
- """Get current comics."""
- for comic in self.useComics():
- if self.indices:
- for index in self.indices:
- self.safeOp(self.getIndex, index)
- else:
- self.safeOp(self.getCurrent)
-
- def doHelp(self):
- """Print help for comic strips."""
- for scraper in self.useComics():
- for line in scraper.getHelp().splitlines():
- out.write("Help: "+line)
-
- def setupComic(self, scraper):
- """Setup the internal comic module from given scraper."""
- self.module = scraper()
- out.context = scraper.get_name()
- return self.module
-
- def useComics(self):
- """Set all comic modules for the defined comics."""
- for comic in self.comics:
c = comic.split(':', 2)
if len(c) > 1:
- self.indices = c[1].split(',')
+ indices = c[1].split(',')
else:
- self.indices = None
-
+ indices = None
moduleName = c[0]
- if moduleName == '@':
- for s in self.getExistingComics():
- yield self.setupComic(s)
- elif moduleName == '@@':
- for s in self.getScrapers():
- yield self.setupComic(s)
- else:
- yield self.setupComic(scraper.get(moduleName))
-
- def run(self, comics):
- """Execute comic commands."""
- self.setOutputInfo()
- self.comics = comics
-
- om = self.settings['output']
- events.installHandler(om, self.settings['basepath'], self.settings['baseurl'])
- events.handler.start()
-
- if self.settings['version']:
- displayVersion()
- elif self.settings['list']:
- self.doList(self.settings['list'] == 1)
- elif len(comics) <= 0:
- out.write('Warning: No comics specified, bailing out!')
- elif self.settings['modhelp']:
- self.doHelp()
- elif self.settings['catchup']:
- self.doCatchup()
- else:
- self.doCurrent()
-
- events.handler.end()
+ yield scraper.get_scraper(moduleName)(indices=indices)
def main():
"""Parse options and execute commands."""
try:
parser = setupOptions()
options, args = parser.parse_args()
- d = Dosage(options.__dict__)
- d.run(args)
- if d.errors:
- res = 1
- else:
- res = 0
+ res = run(options, args)
except KeyboardInterrupt:
print "Aborted."
res = 1
diff --git a/dosagelib/comic.py b/dosagelib/comic.py
index d289e73b7..fe4063912 100644
--- a/dosagelib/comic.py
+++ b/dosagelib/comic.py
@@ -6,8 +6,6 @@ import locale
import rfc822
import time
import shutil
-# XXX why is this done??
-locale.setlocale(locale.LC_ALL, '')
from .output import out
from .util import urlopen, saneDataSize, normaliseURL
@@ -18,16 +16,34 @@ class FetchComicError(IOError):
"""Exception for comic fetching errors."""
pass
-class Comic(object):
- """Download and save a single comic."""
+class ComicStrip(object):
+ """A list of comic image URLs."""
- def __init__(self, moduleName, url, referrer=None, filename=None):
+ def __init__(self, name, parentUrl, imageUrls, namer):
+ """Store the image URL list."""
+ self.name = name
+ self.parentUrl = parentUrl
+ self.imageUrls = imageUrls
+ self.namer = namer
+
+ def getImages(self):
+ """Get a list of image downloaders."""
+ for imageUrl in self.imageUrls:
+ yield self.getDownloader(normaliseURL(imageUrl))
+
+ def getDownloader(self, url):
+ filename = self.namer(url, self.parentUrl)
+ return ComicImage(self.name, self.parentUrl, url, filename)
+
+
+class ComicImage(object):
+ def __init__(self, name, referrer, url, filename):
"""Set URL and filename."""
- self.moduleName = moduleName
- self.url = normaliseURL(url)
+ self.name = name
self.referrer = referrer
+ self.url = url
if filename is None:
- filename = url.split('/')[-1]
+            filename = url.rsplit('/', 1)[1]
self.filename, self.ext = os.path.splitext(filename)
self.filename = self.filename.replace(os.sep, '_')
self.ext = self.ext.replace(os.sep, '_')
@@ -62,13 +78,13 @@ class Comic(object):
def save(self, basepath, showProgress=False):
"""Save comic URL to filename on disk."""
self.connect()
- comicName, comicExt = self.filename, self.ext
+ filename = "%s%s" % (self.filename, self.ext)
comicSize = self.contentLength
- comicDir = os.path.join(basepath, self.moduleName.replace('/', os.sep))
+ comicDir = os.path.join(basepath, self.name.replace('/', os.sep))
if not os.path.isdir(comicDir):
os.makedirs(comicDir)
- fn = os.path.join(comicDir, '%s%s' % (self.filename, self.ext))
+ fn = os.path.join(comicDir, filename)
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
self.urlobj.close()
self.touch(fn)
@@ -76,10 +92,8 @@ class Comic(object):
return fn, False
try:
- tmpFn = os.path.join(comicDir, '__%s%s' % (self.filename, self.ext))
- out.write('Writing comic to temporary file %s...' % (tmpFn,), 3)
- comicOut = file(tmpFn, 'wb')
- try:
+ out.write('Writing comic to file %s...' % (fn,), 3)
+ with open(fn, 'wb') as comicOut:
startTime = time.time()
if showProgress:
def pollData():
@@ -92,12 +106,12 @@ class Comic(object):
else:
comicOut.write(self.urlobj.read())
endTime = time.time()
- finally:
- comicOut.close()
- out.write('Copying temporary file (%s) to %s...' % (tmpFn, fn), 3)
- shutil.copy2(tmpFn, fn)
self.touch(fn)
-
+ except:
+ if os.path.isfile(fn):
+ os.remove(fn)
+ raise
+ else:
size = os.path.getsize(fn)
bytes = locale.format('%d', size, True)
if endTime != startTime:
@@ -106,13 +120,8 @@ class Comic(object):
speed = '???'
attrs = dict(fn=fn, bytes=bytes, speed=speed)
out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
- handler.comicDownloaded(self.moduleName, fn)
- self.urlobj.close()
+ handler.comicDownloaded(self.name, fn)
finally:
- try:
- out.write('Removing temporary file %s...' % (tmpFn,), 3)
- os.remove(tmpFn)
- except:
- pass
+ self.urlobj.close()
return fn, True
diff --git a/dosagelib/configuration.py b/dosagelib/configuration.py
index ac8815dab..24325dd09 100644
--- a/dosagelib/configuration.py
+++ b/dosagelib/configuration.py
@@ -9,11 +9,13 @@ AppName = configdata.name
App = AppName+u" "+Version
Author = configdata.author
HtmlAuthor = Author.replace(u' ', u' ')
-Copyright = u"Copyright (C) 2004-2008 "+Author
-HtmlCopyright = u"Copyright © 2004-2008 "+HtmlAuthor
+Maintainer = configdata.maintainer
+HtmlMaintainer = Maintainer.replace(u' ', u' ')
+Copyright = u"Copyright (C) 2004-2008 "+Author+u", (C) 2012 "+Maintainer
+HtmlCopyright = u"Copyright © 2004-2008 "+HtmlAuthor+u", 2012 "+HtmlMaintainer
Url = configdata.url
SupportUrl = Url + u"/issues"
-Email = configdata.author_email
+Email = configdata.maintainer_email
UserAgent = u"Mozilla/5.0 (compatible; %s/%s; +%s)" % (AppName, Version, Url)
Freeware = AppName+u""" comes with ABSOLUTELY NO WARRANTY!
This is free software, and you are welcome to redistribute it
diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py
index eb4be5dab..995546dc3 100644
--- a/dosagelib/helpers.py
+++ b/dosagelib/helpers.py
@@ -2,112 +2,10 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
import re
+import urlparse
-from .util import fetchUrl, fetchManyUrls, getQueryParams
-from .comic import Comic
-
-class _BasicScraper(object):
- '''Base class with scrape functions for comics.
-
- @type latestUrl: C{string}
- @cvar latestUrl: The URL for the latest comic strip.
- @type imageUrl: C{string}
- @cvar imageUrl: A string that is interpolated with the strip index
- to yield the URL for a particular strip.
- @type imageSearch: C{regex}
- @cvar imageSearch: A compiled regex that will locate the strip image URL
- when applied to the strip page.
- @type prevSearch: C{regex}
- @cvar prevSearch: A compiled regex that will locate the URL for the
- previous strip when applied to a strip page.
- '''
- referrer = None
- help = 'Sorry, no help for this comic yet.'
-
- def __init__(self):
- """Initialize internal variables."""
- self.currentUrl = None
- self.urls = set()
-
- def getReferrer(self, imageUrl, pageUrl):
- """Return referrer for HTTP connection."""
- return self.referrer or pageUrl or self.getLatestUrl()
-
- def getComic(self, url, pageUrl):
- """Get comic downloader for given URL and page."""
- if not url:
- return None
- return Comic(self.get_name(), url, filename=self.getFilename(url, pageUrl), referrer=self.getReferrer(url, pageUrl))
-
- def getCurrentComics(self):
- """Get list of current comics."""
- self.currentUrl = self.getLatestUrl()
- comics = self.getNextComics()
- if not comics:
- raise ValueError("Could not find current comic.")
- return comics
-
- def getNextComics(self):
- """Get all next comics."""
- comics = []
- while not comics and self.currentUrl and self.currentUrl not in self.urls:
- comicUrlGroups, prevUrl = fetchManyUrls(self.currentUrl, [self.imageSearch, self.prevSearch])
-
- if prevUrl:
- prevUrl = prevUrl[0]
- else:
- prevUrl = None
-
- for comicUrl in comicUrlGroups:
- comics.append(self.getComic(comicUrl, self.currentUrl))
-
- self.urls.update([self.currentUrl])
- self.currentUrl = (prevUrl, None)[prevUrl in self.urls]
- return comics
-
- def setStrip(self, index):
- """Set current comic strip URL."""
- self.currentUrl = self.imageUrl % index
-
- def getHelp(self):
- """Return help text for this scraper."""
- return self.help
-
- def __iter__(self):
- """Iterate through the strips, starting from the current one and going backward."""
- if not self.currentUrl:
- self.currentUrl = self.getLatestUrl()
- comics = True
- while comics:
- comics = self.getNextComics()
- if comics:
- yield comics
-
- @classmethod
- def get_name(cls):
- """Get scraper name."""
- if hasattr(cls, 'name'):
- return cls.name
- return cls.__name__
-
- @classmethod
- def starter(cls):
- """Get starter URL from where to scrape comic strips."""
- return cls.latestUrl
-
- @classmethod
- def namer(cls, imageUrl, pageUrl):
- """Return filename for given image and page URL."""
- return None
-
- def getFilename(self, imageUrl, pageUrl):
- """Return filename for given image and page URL."""
- return self.namer(imageUrl, pageUrl)
-
- def getLatestUrl(self):
- """Get starter URL from where to scrape comic strips."""
- return self.starter()
-
+from .util import fetchUrl, getQueryParams
+from .scraper import _BasicScraper
def queryNamer(paramName, usePageUrl=False):
"""Get name from URL query part."""
diff --git a/dosagelib/loader.py b/dosagelib/loader.py
new file mode 100644
index 000000000..29936d624
--- /dev/null
+++ b/dosagelib/loader.py
@@ -0,0 +1,74 @@
+# -*- coding: iso-8859-1 -*-
+
+import os
+import sys
+
+def get_modules(folder, importprefix):
+ """Find all valid modules in the plugins directory. A valid module
+ must have a .py extension, and is importable.
+ @return: all loaded valid modules
+ @rtype: iterator of module
+ """
+ for filename in get_importable_modules(folder):
+ try:
+ module = load_module(filename, importprefix)
+ if module is not None:
+ yield module
+ except StandardError, msg:
+ print "ERROR: could not load module %s: %s" % (filename, msg)
+
+
+def get_importable_modules(folder):
+    """Find all module files in the given folder that end with '.py' and
+ don't start with an underscore.
+ @return module filenames
+ @rtype: iterator of string
+ """
+ for fname in os.listdir(folder):
+ if fname.endswith('.py') and not fname.startswith('_'):
+ yield os.path.join(folder, fname)
+
+
+def load_module(filename, importprefix):
+ """Load and return the module given by the filename.
+    Other exceptions than ImportError are not caught.
+ @return: loaded module or None on import errors
+ @rtype: module or None
+ """
+ name = os.path.splitext(os.path.basename(filename))[0]
+ modulename = "%s%s" % (importprefix, name)
+ __import__(modulename)
+ return sys.modules[modulename]
+
+
+def get_plugins(modules, classobj):
+ """Find all scrapers in all modules.
+ @param modules: the modules to search
+    @type modules: iterator of modules
+ @return: found scrapers
+    @rtype: iterator of class objects
+ """
+ for module in modules:
+ for plugin in get_module_plugins(module, classobj):
+ yield plugin
+
+
+def get_module_plugins(module, classobj):
+    """Return all subclasses of the given classobj in the module.
+ If the module defines __all__, only those entries will be searched,
+ otherwise all objects not starting with '_' will be searched.
+ """
+ try:
+ names = module.__all__
+ except AttributeError:
+ names = [x for x in vars(module) if not x.startswith('_')]
+ for name in names:
+ try:
+ obj = getattr(module, name)
+ except AttributeError:
+ continue
+ try:
+ if issubclass(obj, classobj):
+ yield obj
+ except TypeError:
+ continue
diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py
index 824eb086d..ef4f45999 100644
--- a/dosagelib/plugins/a.py
+++ b/dosagelib/plugins/a.py
@@ -2,14 +2,14 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, MULTILINE
from ..util import tagre
-
-from ..helpers import _BasicScraper, regexNamer, bounceStarter, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import regexNamer, bounceStarter, indirectStarter
class ALessonIsLearned(_BasicScraper):
latestUrl = 'http://www.alessonislearned.com/'
imageUrl = 'http://www.alessonislearned.com/lesson%s.html'
- imageSearch = compile(tagre("img", "src", r"(cmx/.+?)"))
+ imageSearch = compile(tagre("img", "src", r"(cmx/lesson.+?)"))
prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=.+?)")+r".+?previous")
help = 'Index format: nnn'
diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py
index 0b9486bc6..04f75a28c 100644
--- a/dosagelib/plugins/b.py
+++ b/dosagelib/plugins/b.py
@@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile
-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper
class BadlyDrawnKitties(_BasicScraper):
diff --git a/dosagelib/plugins/c.py b/dosagelib/plugins/c.py
index ed82c6749..625362bef 100644
--- a/dosagelib/plugins/c.py
+++ b/dosagelib/plugins/c.py
@@ -2,18 +2,23 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile
-from ..helpers import (
- _BasicScraper, constStarter, bounceStarter, indirectStarter)
-from ..util import getQueryParams
+from ..scraper import _BasicScraper
+from ..helpers import constStarter, bounceStarter, indirectStarter
+from ..util import tagre, getQueryParams
class CalvinAndHobbes(_BasicScraper):
- latestUrl = 'http://www.gocomics.com/calvinandhobbes/'
+ starter = bounceStarter('http://www.gocomics.com/calvinandhobbes/',
+ compile(tagre("a", "href", "(/calvinandhobbes/\d+/\d+/\d+)")+"Next feature"))
imageUrl = 'http://www.gocomics.com/calvinandhobbes/%s'
- imageSearch = compile(r'src="(http://picayune\.uclick\.com/comics/ch/[^"]+\.gif)"')
- prevSearch = compile(r'href="(.*?)"\s+onclick="[^"]*">Previous day')
+ imageSearch = compile(tagre("img", "src", "(http://assets\.amuniversal\.com/[a-f0-9]+)"))
+ prevSearch = compile(tagre("a", "href", "(/calvinandhobbes/\d+/\d+/\d+)")+"Previous feature")
help = 'Index format: yyyy/mm/dd'
+ @classmethod
+ def namer(cls, imageUrl, pageUrl):
+ prefix, year, month, day = pageUrl.rsplit('/', 3)
+ return "%s%s%s.gif" % (year, month, day)
class CandyCartoon(_BasicScraper):
diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py
index ddadcab2b..9f3ec72c4 100644
--- a/dosagelib/plugins/d.py
+++ b/dosagelib/plugins/d.py
@@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE, MULTILINE
-from ..helpers import _BasicScraper, bounceStarter, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter, indirectStarter
from ..util import getQueryParams
diff --git a/dosagelib/plugins/drunkduck.py b/dosagelib/plugins/drunkduck.py
index 402c08d32..ea1b7e080 100644
--- a/dosagelib/plugins/drunkduck.py
+++ b/dosagelib/plugins/drunkduck.py
@@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper, bounceStarter, queryNamer
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter, queryNamer
def drunkDuck(shortName):
diff --git a/dosagelib/plugins/e.py b/dosagelib/plugins/e.py
index bc7eea357..3597b8148 100644
--- a/dosagelib/plugins/e.py
+++ b/dosagelib/plugins/e.py
@@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper, indirectStarter
+from ..helpers import indirectStarter
+from ..scraper import _BasicScraper
class EerieCuties(_BasicScraper):
diff --git a/dosagelib/plugins/f.py b/dosagelib/plugins/f.py
index f66d3b1fb..9f7cf6e2f 100644
--- a/dosagelib/plugins/f.py
+++ b/dosagelib/plugins/f.py
@@ -1,9 +1,10 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE, MULTILINE
-from ..util import tagre
-from ..helpers import _BasicScraper, indirectStarter
+from ..util import tagre
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter
class FalconTwin(_BasicScraper):
diff --git a/dosagelib/plugins/g.py b/dosagelib/plugins/g.py
index c5b2b140c..fcbc13f9f 100644
--- a/dosagelib/plugins/g.py
+++ b/dosagelib/plugins/g.py
@@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile
-from ..helpers import _BasicScraper, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter
class Galaxion(_BasicScraper):
diff --git a/dosagelib/plugins/h.py b/dosagelib/plugins/h.py
index 3e34fd0ea..228ca85ea 100644
--- a/dosagelib/plugins/h.py
+++ b/dosagelib/plugins/h.py
@@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile
-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper
class HappyMedium(_BasicScraper):
diff --git a/dosagelib/plugins/i.py b/dosagelib/plugins/i.py
index b5ed19056..3ac32c160 100644
--- a/dosagelib/plugins/i.py
+++ b/dosagelib/plugins/i.py
@@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper
class IDreamOfAJeanieBottle(_BasicScraper):
diff --git a/dosagelib/plugins/j.py b/dosagelib/plugins/j.py
index adbe635dd..b241e6fca 100644
--- a/dosagelib/plugins/j.py
+++ b/dosagelib/plugins/j.py
@@ -2,8 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, MULTILINE
-from ..helpers import _BasicScraper
-
+from ..scraper import _BasicScraper
class Jack(_BasicScraper):
diff --git a/dosagelib/plugins/k.py b/dosagelib/plugins/k.py
index 66027e024..9572532df 100644
--- a/dosagelib/plugins/k.py
+++ b/dosagelib/plugins/k.py
@@ -2,8 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper
-
+from ..scraper import _BasicScraper
class KernelPanic(_BasicScraper):
diff --git a/dosagelib/plugins/keenspot.py b/dosagelib/plugins/keenspot.py
index bbf5aa697..6f1ccee53 100644
--- a/dosagelib/plugins/keenspot.py
+++ b/dosagelib/plugins/keenspot.py
@@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper
def keenSpot(comics):
diff --git a/dosagelib/plugins/l.py b/dosagelib/plugins/l.py
index f662b38d6..a91a14aa2 100644
--- a/dosagelib/plugins/l.py
+++ b/dosagelib/plugins/l.py
@@ -2,8 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile
-from ..helpers import _BasicScraper, indirectStarter
-
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter
class LasLindas(_BasicScraper):
diff --git a/dosagelib/plugins/m.py b/dosagelib/plugins/m.py
index 5849db562..03e80a794 100644
--- a/dosagelib/plugins/m.py
+++ b/dosagelib/plugins/m.py
@@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper, queryNamer
+from ..scraper import _BasicScraper
+from ..helpers import queryNamer
class MadamAndEve(_BasicScraper):
diff --git a/dosagelib/plugins/n.py b/dosagelib/plugins/n.py
index 66ac9b95f..78c2e9a75 100644
--- a/dosagelib/plugins/n.py
+++ b/dosagelib/plugins/n.py
@@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper, indirectStarter, _PHPScraper
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter, _PHPScraper
diff --git a/dosagelib/plugins/num.py b/dosagelib/plugins/num.py
index b20c4f9d4..77ef10d8b 100644
--- a/dosagelib/plugins/num.py
+++ b/dosagelib/plugins/num.py
@@ -2,8 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile
-from ..helpers import _BasicScraper
-
+from ..scraper import _BasicScraper
class NineteenNinetySeven(_BasicScraper):
diff --git a/dosagelib/plugins/o.py b/dosagelib/plugins/o.py
index 726086bdb..56d8f261a 100644
--- a/dosagelib/plugins/o.py
+++ b/dosagelib/plugins/o.py
@@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter
class OctopusPie(_BasicScraper):
diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py
index 496d2926c..73253af9d 100644
--- a/dosagelib/plugins/p.py
+++ b/dosagelib/plugins/p.py
@@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper, bounceStarter, queryNamer
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter, queryNamer
class PartiallyClips(_BasicScraper):
diff --git a/dosagelib/plugins/q.py b/dosagelib/plugins/q.py
index c721317b7..02739a82f 100644
--- a/dosagelib/plugins/q.py
+++ b/dosagelib/plugins/q.py
@@ -2,8 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile
-from ..helpers import _BasicScraper
-
+from ..scraper import _BasicScraper
class QuestionableContent(_BasicScraper):
diff --git a/dosagelib/plugins/r.py b/dosagelib/plugins/r.py
index 58016469f..c88466335 100644
--- a/dosagelib/plugins/r.py
+++ b/dosagelib/plugins/r.py
@@ -2,8 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile
-from ..helpers import _BasicScraper, bounceStarter
-
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter
class RadioactivePanda(_BasicScraper):
diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py
index eb56bb114..50b795279 100644
--- a/dosagelib/plugins/s.py
+++ b/dosagelib/plugins/s.py
@@ -3,7 +3,8 @@
from re import compile, MULTILINE, IGNORECASE, sub
from os.path import splitext
-from ..helpers import _BasicScraper, bounceStarter, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter, indirectStarter
class SailorsunOrg(_BasicScraper):
diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py
index c9b8559d4..7e93fbf8f 100644
--- a/dosagelib/plugins/t.py
+++ b/dosagelib/plugins/t.py
@@ -2,8 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper, indirectStarter
-
+from ..scraper import _BasicScraper
+from ..helpers import indirectStarter
class TalesOfPylea(_BasicScraper):
diff --git a/dosagelib/plugins/u.py b/dosagelib/plugins/u.py
index 164104d38..d3c115a71 100644
--- a/dosagelib/plugins/u.py
+++ b/dosagelib/plugins/u.py
@@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE
-from ..helpers import _BasicScraper, bounceStarter, indirectStarter
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter, indirectStarter
from ..util import getQueryParams
diff --git a/dosagelib/plugins/uc.py b/dosagelib/plugins/uc.py
index 78e88e350..72a245557 100644
--- a/dosagelib/plugins/uc.py
+++ b/dosagelib/plugins/uc.py
@@ -2,8 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE, sub
-from ..helpers import _BasicScraper
-from ..util import fetchManyMatches, fetchUrl
+from ..scraper import _BasicScraper
+from ..util import fetchUrl
class _UClickScraper(_BasicScraper):
@@ -24,6 +24,7 @@ class _UClickScraper(_BasicScraper):
'index',
)
+ # XXX refactor this mess
submoduleSearch = compile(r'([^>]+?)', IGNORECASE)
partsMatch = compile(r'([^>]+?)', IGNORECASE)
matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]
diff --git a/dosagelib/plugins/v.py b/dosagelib/plugins/v.py
index eae02598b..109db8041 100644
--- a/dosagelib/plugins/v.py
+++ b/dosagelib/plugins/v.py
@@ -2,8 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE, MULTILINE
-from ..helpers import _BasicScraper
-
+from ..scraper import _BasicScraper
class _VGCats(_BasicScraper):
diff --git a/dosagelib/plugins/w.py b/dosagelib/plugins/w.py
index 6ac858607..0aaedd9af 100644
--- a/dosagelib/plugins/w.py
+++ b/dosagelib/plugins/w.py
@@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE, DOTALL
-from ..helpers import _BasicScraper, queryNamer, bounceStarter
+from ..scraper import _BasicScraper
+from ..helpers import queryNamer, bounceStarter
class WayfarersMoon(_BasicScraper):
diff --git a/dosagelib/plugins/x.py b/dosagelib/plugins/x.py
index d9861822c..6ba59215f 100644
--- a/dosagelib/plugins/x.py
+++ b/dosagelib/plugins/x.py
@@ -2,7 +2,9 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile
-from ..helpers import _BasicScraper, bounceStarter
+from ..scraper import _BasicScraper
+from ..helpers import bounceStarter
+
class xkcd(_BasicScraper):
starter = bounceStarter('http://xkcd.com/', compile(r']*>Next'))
diff --git a/dosagelib/plugins/y.py b/dosagelib/plugins/y.py
index 13f030d0d..a1bef31b5 100644
--- a/dosagelib/plugins/y.py
+++ b/dosagelib/plugins/y.py
@@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, MULTILINE
-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper
class YAFGC(_BasicScraper):
diff --git a/dosagelib/plugins/z.py b/dosagelib/plugins/z.py
index 492c0ea5e..88ad48fa7 100644
--- a/dosagelib/plugins/z.py
+++ b/dosagelib/plugins/z.py
@@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile
-from ..helpers import _BasicScraper
+from ..scraper import _BasicScraper
class Zapiro(_BasicScraper):
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index 6a3621744..a94590927 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -2,47 +2,122 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
import os
-import sys
-
-from .helpers import _BasicScraper
+from . import loader
+from .util import fetchUrls
+from .comic import ComicStrip
disabled = []
def init_disabled():
filename = os.path.expanduser('~/.dosage/disabled')
- if not os.path.isfile(filename):
- return
- with open(filename) as f:
- for line in f:
- if line and not line.startswith('#'):
- disabled.append(line.rstrip())
+ if os.path.isfile(filename):
+ with open(filename) as f:
+ for line in f:
+ if line and not line.startswith('#'):
+ disabled.append(line.rstrip())
init_disabled()
class DisabledComicError(ValueError):
pass
-def get(comicName):
+class _BasicScraper(object):
+ '''Base class with scrape functions for comics.
+
+ @type latestUrl: C{string}
+ @cvar latestUrl: The URL for the latest comic strip.
+ @type imageUrl: C{string}
+ @cvar imageUrl: A string that is interpolated with the strip index
+ to yield the URL for a particular strip.
+ @type imageSearch: C{regex}
+ @cvar imageSearch: A compiled regex that will locate the strip image URL
+ when applied to the strip page.
+ @type prevSearch: C{regex}
+ @cvar prevSearch: A compiled regex that will locate the URL for the
+ previous strip when applied to a strip page.
+ '''
+ help = 'Sorry, no help for this comic yet.'
+
+ def __init__(self, indices=None):
+ """Initialize internal variables."""
+ self.urls = set()
+ self.indices = indices
+
+ def getCurrentStrip(self):
+ """Get current comic strip."""
+ return self.getStrip(self.getLatestUrl())
+
+ def getStrip(self, url):
+ """Get comic strip for given URL."""
+ imageUrls = fetchUrls(url, self.imageSearch)
+ return self.getComicStrip(url, imageUrls)
+
+ def getComicStrip(self, url, imageUrls):
+ """Get comic strip downloader for given URL and images."""
+ return ComicStrip(self.get_name(), url, imageUrls, self.namer)
+
+ def getAllStrips(self):
+ """Get all comic strips."""
+ seen_urls = set()
+ url = self.getLatestUrl()
+ while url:
+ imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
+ seen_urls.add(url)
+ yield self.getComicStrip(url, imageUrls)
+ # avoid recursive URL loops
+ url = prevUrl if prevUrl not in seen_urls else None
+
+ def setStrip(self, index):
+ """Set current comic strip URL."""
+ self.currentUrl = self.imageUrl % index
+
+ def getHelp(self):
+ """Return help text for this scraper."""
+ return self.help
+
+ @classmethod
+ def get_name(cls):
+ """Get scraper name."""
+ if hasattr(cls, 'name'):
+ return cls.name
+ return cls.__name__
+
+ @classmethod
+ def starter(cls):
+ """Get starter URL from where to scrape comic strips."""
+ return cls.latestUrl
+
+ @classmethod
+ def namer(cls, imageUrl, pageUrl):
+ """Return filename for given image and page URL."""
+ return None
+
+ def getFilename(self, imageUrl, pageUrl):
+ """Return filename for given image and page URL."""
+ return self.namer(imageUrl, pageUrl)
+
+ def getLatestUrl(self):
+ """Get starter URL from where to scrape comic strips."""
+ return self.starter()
+
+
+def get_scraper(comic):
"""Returns a comic module object."""
candidates = []
- for scraper in get_scrapers():
- lname = scraper.get_name().lower()
- cname = comicName.lower()
+ cname = comic.lower()
+ for scraperclass in get_scrapers():
+ lname = scraperclass.get_name().lower()
if lname == cname:
# perfect match
- return scraper
+ return scraperclass
if cname in lname:
- candidates.append(scraper)
+ candidates.append(scraperclass)
if len(candidates) == 1:
return candidates[0]
elif candidates:
comics = ", ".join(x.get_name() for x in candidates)
raise ValueError('Multiple comics %s found.' % comics)
else:
- raise ValueError('Comic %r not found.' % comicName)
-
-
-def items():
- return get_scrapers()
+ raise ValueError('Comic %r not found.' % comic)
_scrapers = None
@@ -54,91 +129,23 @@ def get_scrapers():
"""
global _scrapers
if _scrapers is None:
- _scrapers = list(get_all_plugins(get_modules()))
+ folder = os.path.join(os.path.dirname(__file__), 'plugins')
+ importprefix = 'dosagelib.plugins.'
+ modules = loader.get_modules(folder, importprefix)
+ plugins = loader.get_plugins(modules, _BasicScraper)
+ _scrapers = list(plugins)
_scrapers.sort(key=lambda s: s.get_name())
check_scrapers()
return _scrapers
def check_scrapers():
+ """Check for duplicate scraper class names."""
d = {}
- for s in _scrapers:
- name = s.get_name().lower()
+ for scraperclass in _scrapers:
+ name = scraperclass.get_name().lower()
if name in d:
- name1 = s.get_name()
+ name1 = scraperclass.get_name()
name2 = d[name].get_name()
raise ValueError('Duplicate scrapers %s and %s found' % (name1, name2))
- d[name] = s
-
-
-def get_modules():
- """Find all valid modules in the plugins directory. A valid module
- must have a .py extension, and is importable.
- @return: all loaded valid modules
- @rtype: iterator of module
- """
- # load from the plugins folder
- folder = os.path.join(os.path.dirname(__file__), 'plugins')
- for filename in get_importable_modules(folder):
- try:
- module = load_module(filename)
- if module is not None:
- yield module
- except StandardError, msg:
- print "ERROR", msg
-
-
-def get_importable_modules(folder):
- """Find all module files in the given folder that end witn '.py' and
- don't start with an underscore.
- @return module filenames
- @rtype: iterator of string
- """
- for fname in os.listdir(folder):
- if fname.endswith('.py') and not fname.startswith('_'):
- yield os.path.join(folder, fname)
-
-
-def load_module(filename):
- """Load and return the module given by the filename.
- Other exceptions than ImportError are not catched.
- @return: loaded module or None on import errors
- @rtype: module or None
- """
- name = os.path.splitext(os.path.basename(filename))[0]
- modulename = "dosagelib.plugins.%s" % name
- __import__(modulename)
- return sys.modules[modulename]
-
-
-def get_all_plugins(modules):
- """Find all scrapers in all modules.
- @param modules: the modules to search
- @ptype modules: iterator of modules
- @return: found scrapers
- @rytpe: iterator of class objects
- """
- for module in modules:
- for plugin in get_plugins(module):
- yield plugin
-
-
-def get_plugins(module):
- """Return all subclasses of _BasicScraper in the module.
- If the module defines __all__, only those entries will be searched,
- otherwise all objects not starting with '_' will be searched.
- """
- try:
- names = module.__all__
- except AttributeError:
- names = [x for x in vars(module) if not x.startswith('_')]
- for name in names:
- try:
- obj = getattr(module, name)
- except AttributeError:
- continue
- try:
- if issubclass(obj, _BasicScraper):
- yield obj
- except TypeError:
- continue
+ d[name] = scraperclass
diff --git a/dosagelib/util.py b/dosagelib/util.py
index 27231a708..f95fd06c6 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -21,72 +21,90 @@ if os.name == 'nt':
has_curses = has_module("curses")
-class NoMatchError(Exception):
- pass
+MAX_FILESIZE = 1024*1024*1 # 1MB
-def getMatchValues(matches):
- return set([match.group(1) for match in matches])
+def tagre(tag, attribute, value):
+ """Return a regular expression matching the given HTML tag, attribute
+ and value. It matches the tag and attribute names case insensitive,
+ and skips arbitrary whitespace and leading HTML attributes. The "<>" at
+ the start and end of the HTML tag is also matched.
+ @param tag: the tag name
+ @ptype tag: string
+ @param attribute: the attribute name
+ @ptype attribute: string
+ @param value: the attribute value
+ @ptype value: string
+ @return: the generated regular expression suitable for re.compile()
+ @rtype: string
+ """
+ attrs = dict(
+ tag=case_insensitive_re(tag),
+ attribute=case_insensitive_re(attribute),
+ value=value,
+ )
+ return r'<\s*%(tag)s[^>]*\s+%(attribute)s\s*=\s*"%(value)s"[^>]*/?>' % attrs
-def fetchManyMatches(url, regexes):
- '''Returns a list containing lists of matches for each regular expression, in the same order.'''
- out.write('Matching regex(es) %r multiple times against %s...' % ([rex.pattern for rex in regexes], url), 2)
+
+def case_insensitive_re(name):
+ """Reformat the given name to a case insensitive regular expression string
+ without using re.IGNORECASE. This way selective strings can be made case
+ insensitive.
+ @param name: the name to make case insensitive
+ @ptype name: string
+ @return: the case insensitive regex
+ @rtype: string
+ """
+ return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
+
+
+baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
+
+def getPageContent(url):
+ # read page data
page = urlopen(url)
- data = page.read()
-
- matches = [getMatchValues(regex.finditer(data)) for regex in regexes]
- if matches:
- out.write('...found %r' % (matches,), 2)
- else:
- out.write('...not found!', 2)
-
- return list(matches)
-
-def fetchMatches(url, regexes):
- out.write('Matching regex(es) %r against %s...' % ([rex.pattern for rex in regexes], url), 2)
- page = urlopen(url)
- data = page.read()
-
- matches = []
- for regex in regexes:
- match = regex.search(data)
- if match:
- matches.append(match.group(1))
-
- if matches:
- out.write('...found %r' % (matches,), 2)
- else:
- out.write('...not found!', 2)
-
- return matches
-
-def fetchMatch(url, regex):
- matches = fetchMatches(url, (regex,))
- if matches:
- return matches[0]
- return None
-
-def fetchUrl(url, regex):
- match = fetchMatch(url, regex)
+ data = page.read(MAX_FILESIZE)
+ # determine base URL
+ baseUrl = None
+ match = baseSearch.search(data)
if match:
- return urlparse.urljoin(url, match)
+ baseUrl = match.group(1)
+ else:
+ baseUrl = url
+ return data, baseUrl
+
+
+def fetchUrl(url, searchRo):
+ data, baseUrl = getPageContent(url)
+ match = searchRo.search(data)
+ if match:
+ searchUrl = match.group(1)
+ out.write('matched URL %r' % searchUrl, 2)
+ return urlparse.urljoin(baseUrl, searchUrl)
return None
-baseSearch = re.compile(r'', re.IGNORECASE)
-def fetchUrls(url, regexes):
- matches = fetchMatches(url, [baseSearch] + list(regexes))
- baseUrl = matches.pop(0) or url
- return [urlparse.urljoin(baseUrl, match) for match in matches]
-def fetchManyUrls(url, regexes):
- matchGroups = fetchManyMatches(url, [baseSearch] + list(regexes))
- baseUrl = matchGroups.pop(0) or [url]
- baseUrl = baseUrl[0]
+def fetchUrls(url, imageSearch, prevSearch=None):
+ data, baseUrl = getPageContent(url)
+ # match images
+ imageUrls = set()
+ for match in imageSearch.finditer(data):
+ imageUrl = match.group(1)
+ out.write('matched image URL %r' % imageUrl, 2)
+ imageUrls.add(urlparse.urljoin(baseUrl, imageUrl))
+ if not imageUrls:
+ raise ValueError("No images found at %s with pattern %s" % (url, imageSearch.pattern))
+ if prevSearch is not None:
+ # match previous URL
+ match = prevSearch.search(data)
+ if match:
+ prevUrl = match.group(1)
+ out.write('matched previous URL %r' % prevUrl, 2)
+ prevUrl = urlparse.urljoin(baseUrl, prevUrl)
+ else:
+ prevUrl = None
+ return imageUrls, prevUrl
+ return imageUrls
- xformedGroups = []
- for matchGroup in matchGroups:
- xformedGroups.append([urlparse.urljoin(baseUrl, match) for match in matchGroup])
-
- return xformedGroups
def _unescape(text):
"""
@@ -278,37 +296,3 @@ def strtimezone():
else:
zone = time.timezone
return "%+04d" % (-zone//3600)
-
-
-def tagre(tag, attribute, value):
- """Return a regular expression matching the given HTML tag, attribute
- and value. It matches the tag and attribute names case insensitive,
- and skips arbitrary whitespace and leading HTML attributes. The "<>" at
- the start and end of the HTML tag is also matched.
- @param tag: the tag name
- @ptype tag: string
- @param attribute: the attribute name
- @ptype attribute: string
- @param value: the attribute value
- @ptype value: string
- @return: the generated regular expression suitable for re.compile()
- @rtype: string
- """
- attrs = dict(
- tag=case_insensitive_re(tag),
- attribute=case_insensitive_re(attribute),
- value=value,
- )
- return r'<\s*%(tag)s[^>]*\s+%(attribute)s\s*=\s*"%(value)s"[^>]>' % attrs
-
-def case_insensitive_re(name):
- """Reformat the given name to a case insensitive regular expression string
- without using re.IGNORECASE. This way selective strings can be made case
- insensitive.
- @param name: the name to make case insensitive
- @ptype name: string
- @return: the case insenstive regex
- @rtype: string
- """
- return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
-