A lot of refactoring.

This commit is contained in:
Bastian Kleineidam 2012-10-11 12:03:12 +02:00
parent 4ba973abf5
commit c707aa893d
37 changed files with 472 additions and 551 deletions

240
dosage
View file

@ -20,7 +20,6 @@
import sys import sys
import os import os
import optparse import optparse
import traceback
from dosagelib import events, scraper from dosagelib import events, scraper
from dosagelib.output import out from dosagelib.output import out
@ -35,13 +34,13 @@ def setupOptions():
usage = 'usage: %prog [options] comicModule [comicModule ...]' usage = 'usage: %prog [options] comicModule [comicModule ...]'
parser = optparse.OptionParser(usage=usage) parser = optparse.OptionParser(usage=usage)
parser.add_option('-v', '--verbose', action='count', dest='verbose', default=0, help='provides verbose output, use multiple times for more verbosity') parser.add_option('-v', '--verbose', action='count', dest='verbose', default=0, help='provides verbose output, use multiple times for more verbosity')
parser.add_option('-c', '--catch-up', action='count', dest='catchup', default=None, help='traverse and retrieve all available comics up until the strip that already exists locally, use twice to retrieve until all strips exist locally') parser.add_option('-c', '--catchup', action='count', dest='catchup', default=None, help='traverse and retrieve all available comics up until the strip that already exists locally, use twice to retrieve until all strips exist locally')
parser.add_option('-b', '--base-path', action='store', dest='basepath', default='Comics', help='set the path to create invidivual comic directories in, default is Comics', metavar='PATH') parser.add_option('-b', '--basepath', action='store', dest='basepath', default='Comics', help='set the path to create invidivual comic directories in, default is Comics', metavar='PATH')
parser.add_option('--base-url', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --base-path', metavar='PATH') parser.add_option('--baseurl', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --base-path', metavar='PATH')
parser.add_option('-l', '--list', action='store_const', const=1, dest='list', help='list available comic modules') parser.add_option('-l', '--list', action='store_const', const=1, dest='list', help='list available comic modules')
parser.add_option('--single-list', action='store_const', const=2, dest='list', help='list available comic modules in a single list') parser.add_option('--singlelist', action='store_const', const=2, dest='list', help='list available comic modules in a single list')
parser.add_option('-V', '--version', action='store_true', dest='version', help='display the version number') parser.add_option('-V', '--version', action='store_true', dest='version', help='display the version number')
parser.add_option('-m', '--module-help', action='store_true', dest='modhelp', help='display help for comic modules') parser.add_option('-m', '--modulehelp', action='store_true', dest='modhelp', help='display help for comic modules')
parser.add_option('-t', '--timestamps', action='store_true', dest='timestamps', default=False, help='print timestamps for all output at any info level') parser.add_option('-t', '--timestamps', action='store_true', dest='timestamps', default=False, help='print timestamps for all output at any info level')
parser.add_option('-o', '--output', action='store', dest='output', choices=events.getHandlers(), help='output formatting for downloaded comics') parser.add_option('-o', '--output', action='store', dest='output', choices=events.getHandlers(), help='output formatting for downloaded comics')
if is_tty(sys.stdout): if is_tty(sys.stdout):
@ -54,196 +53,129 @@ def displayVersion():
print App print App
print Copyright print Copyright
print Freeware print Freeware
return 0
class Dosage(object): def setOutputInfo(options):
"""Main program executing comic commands."""
def __init__(self, settings):
"""Store settings and initialize internal variables."""
self.settings = settings
self.errors = 0
def setOutputInfo(self):
"""Set global output level and timestamp option.""" """Set global output level and timestamp option."""
out.level = 0 out.level = 0
out.level += self.settings['verbose'] out.level += options.verbose
out.timestamps = self.settings['timestamps'] out.timestamps = options.timestamps
def saveComic(self, comic):
"""Save one comic strip in an output file."""
basepath = self.settings['basepath']
progress = self.settings.get('progress', False)
fn, saved = comic.save(basepath, progress)
return saved
def saveComics(self, comics): def saveComicStrip(strip, basepath, progress):
"""Save a list of comics.""" """Save a comic strip which can consist of multiple images."""
saved = False errors = 0
for comic in comics: for image in strip.getImages():
saved = self.saveComic(comic) or saved
return saved
def safeOp(self, fp, *args, **kwargs):
"""Run a function and catch and report any errors."""
try: try:
fp(*args, **kwargs) image.save(basepath, progress)
except Exception: except IOError, msg:
self.errors += 1 out.write('Error saving %s: %s' % (image.filename, msg))
type, value, tb = sys.exc_info() errors += 1
out.write('Traceback (most recent call last):', 1) return errors
out.writelines(traceback.format_stack(), 1)
out.writelines(traceback.format_tb(tb)[1:], 1)
out.writelines(traceback.format_exception_only(type, value))
def getCurrent(self):
"""Retrieve and save all current comic strips."""
out.write('Retrieving the current strip...')
self.saveComics(self.module.getCurrentComics())
def getIndex(self, index): def displayHelp(comics, basepath):
"""Retrieve comcis with given index.""" """Print help for comic strips."""
out.write('Retrieving index "%s"....' % (index,)) for scraperobj in getScrapers(comics, basepath):
try: for line in scraperobj.getHelp().splitlines():
self.module.setStrip(index) out.write("Help: "+line)
self.saveComics(self.module.getNextComics()) return 0
except NotImplementedError:
out.write('No indexed retrieval support.')
def catchup(self): def getComics(options, comics):
"""Save all comics until the current date.""" errors = 0
events.installHandler(options.output, options.basepath, options.baseurl)
events.handler.start()
for scraperobj in getScrapers(comics, options.basepath):
out.context = scraperobj.get_name()
if options.catchup:
out.write('Catching up...') out.write('Catching up...')
for comics in self.module: strips = scraperobj.getAllStrips()
if not self.saveComics(comics) and self.settings['catchup'] < 2: else:
break out.write('Retrieving the current strip...')
strips = [scraperobj.getCurrentStrip()]
for strip in strips:
errors += saveComicStrip(strip, options.basepath, options.progress)
events.handler.end()
return errors
def catchupIndex(self, index):
"""Retrieve and save all comics from the given index."""
out.write('Catching up from index "%s"...' % (index,))
self.module.setStrip(index)
for comics in self.module:
if not self.saveComics(comics) and self.settings['catchup'] < 2:
break
def getScrapers(self): def run(options, comics):
"""Get list of scraper objects.""" """Execute comic commands."""
return scraper.items() setOutputInfo(options)
if options.version:
return displayVersion()
if options.list:
return doList(options.list == 1)
if len(comics) <= 0:
out.write('Warning: No comics specified, bailing out!')
return 1
if options.modhelp:
return displayHelp(comics, options.basepath)
errors = getComics(options, comics)
def getExistingComics(self):
"""Get all existing comic scrapers."""
for scraper in self.getScrapers():
dirname = scraper.get_name().replace('/', os.sep)
if os.path.isdir(os.path.join(self.settings['basepath'], dirname)):
yield scraper
def doList(self, columnList): def doList(columnList):
"""List available comics.""" """List available comics."""
out.write('Available comic scrapers:') out.write('Available comic scrapers:')
scrapers = self.getScrapers() scrapers = getScrapers(['@@'])
if len(scrapers) > 0:
if columnList: if columnList:
self.doColumnList(scrapers) doColumnList(scrapers)
else: else:
self.doSingleList(scrapers) doSingleList(scrapers)
out.write('%d supported comics.' % len(scrapers)) out.write('%d supported comics.' % len(scrapers))
return 0
def doSingleList(self, scrapers):
def doSingleList(scrapers):
"""Get list of scraper names, one per line.""" """Get list of scraper names, one per line."""
print '\n'.join(scraper.get_name() for scraper in scrapers) print '\n'.join(scraperobj.get_name() for scraperobj in scrapers)
def doColumnList(self, scrapers):
def doColumnList(scrapers):
"""Get list of scraper names with multiple names per line.""" """Get list of scraper names with multiple names per line."""
screenWidth = get_columns() screenWidth = get_columns()
names = [scraper.get_name() for scraper in scrapers] names = [scraperobj.get_name() for scraperobj in scrapers]
maxlen = max([len(name) for name in names]) maxlen = max([len(name) for name in names])
namesPerLine = int(screenWidth / (maxlen + 1)) namesPerLine = int(screenWidth / (maxlen + 1))
while names: while names:
print ''.join([name.ljust(maxlen) for name in names[:namesPerLine]]) print ''.join([name.ljust(maxlen) for name in names[:namesPerLine]])
del names[:namesPerLine] del names[:namesPerLine]
def doCatchup(self):
"""Catchup comics.""" def getScrapers(comics, basepath=None):
for comic in self.useComics(): """Get scraper objects for the given comics."""
if self.indices: if '@' in comics:
self.safeOp(self.catchupIndex, self.indices[0]) # only scrapers whose directory already exists
if len(comics) > 1:
out.write("WARN: using '@' as comic name ignores all other specified comics.\n")
for scraperclass in scraper.get_scrapers():
dirname = scraperclass.get_name().replace('/', os.sep)
if os.path.isdir(os.path.join(basepath, dirname)):
yield scraperclass()
elif '@@' in comics:
# all scrapers
if len(comics) > 1:
out.write("WARN: using '@@' as comic name ignores all other specified comics.\n")
for scraperclass in scraper.get_scrapers():
yield scraperclass()
else: else:
self.safeOp(self.catchup) # only selected
for comic in comics:
def doCurrent(self):
"""Get current comics."""
for comic in self.useComics():
if self.indices:
for index in self.indices:
self.safeOp(self.getIndex, index)
else:
self.safeOp(self.getCurrent)
def doHelp(self):
"""Print help for comic strips."""
for scraper in self.useComics():
for line in scraper.getHelp().splitlines():
out.write("Help: "+line)
def setupComic(self, scraper):
"""Setup the internal comic module from given scraper."""
self.module = scraper()
out.context = scraper.get_name()
return self.module
def useComics(self):
"""Set all comic modules for the defined comics."""
for comic in self.comics:
c = comic.split(':', 2) c = comic.split(':', 2)
if len(c) > 1: if len(c) > 1:
self.indices = c[1].split(',') indices = c[1].split(',')
else: else:
self.indices = None indices = None
moduleName = c[0] moduleName = c[0]
if moduleName == '@': yield scraper.get_scraper(moduleName)(indices=indices)
for s in self.getExistingComics():
yield self.setupComic(s)
elif moduleName == '@@':
for s in self.getScrapers():
yield self.setupComic(s)
else:
yield self.setupComic(scraper.get(moduleName))
def run(self, comics):
"""Execute comic commands."""
self.setOutputInfo()
self.comics = comics
om = self.settings['output']
events.installHandler(om, self.settings['basepath'], self.settings['baseurl'])
events.handler.start()
if self.settings['version']:
displayVersion()
elif self.settings['list']:
self.doList(self.settings['list'] == 1)
elif len(comics) <= 0:
out.write('Warning: No comics specified, bailing out!')
elif self.settings['modhelp']:
self.doHelp()
elif self.settings['catchup']:
self.doCatchup()
else:
self.doCurrent()
events.handler.end()
def main(): def main():
"""Parse options and execute commands.""" """Parse options and execute commands."""
try: try:
parser = setupOptions() parser = setupOptions()
options, args = parser.parse_args() options, args = parser.parse_args()
d = Dosage(options.__dict__) res = run(options, args)
d.run(args)
if d.errors:
res = 1
else:
res = 0
except KeyboardInterrupt: except KeyboardInterrupt:
print "Aborted." print "Aborted."
res = 1 res = 1

View file

@ -6,8 +6,6 @@ import locale
import rfc822 import rfc822
import time import time
import shutil import shutil
# XXX why is this done??
locale.setlocale(locale.LC_ALL, '')
from .output import out from .output import out
from .util import urlopen, saneDataSize, normaliseURL from .util import urlopen, saneDataSize, normaliseURL
@ -18,16 +16,34 @@ class FetchComicError(IOError):
"""Exception for comic fetching errors.""" """Exception for comic fetching errors."""
pass pass
class Comic(object): class ComicStrip(object):
"""Download and save a single comic.""" """A list of comic image URLs."""
def __init__(self, moduleName, url, referrer=None, filename=None): def __init__(self, name, parentUrl, imageUrls, namer):
"""Store the image URL list."""
self.name = name
self.parentUrl = parentUrl
self.imageUrls = imageUrls
self.namer = namer
def getImages(self):
"""Get a list of image downloaders."""
for imageUrl in self.imageUrls:
yield self.getDownloader(normaliseURL(imageUrl))
def getDownloader(self, url):
filename = self.namer(url, self.parentUrl)
return ComicImage(self.name, self.parentUrl, url, filename)
class ComicImage(object):
def __init__(self, name, referrer, url, filename):
"""Set URL and filename.""" """Set URL and filename."""
self.moduleName = moduleName self.name = name
self.url = normaliseURL(url)
self.referrer = referrer self.referrer = referrer
self.url = url
if filename is None: if filename is None:
filename = url.split('/')[-1] filename = url.rsplit('/')[1]
self.filename, self.ext = os.path.splitext(filename) self.filename, self.ext = os.path.splitext(filename)
self.filename = self.filename.replace(os.sep, '_') self.filename = self.filename.replace(os.sep, '_')
self.ext = self.ext.replace(os.sep, '_') self.ext = self.ext.replace(os.sep, '_')
@ -62,13 +78,13 @@ class Comic(object):
def save(self, basepath, showProgress=False): def save(self, basepath, showProgress=False):
"""Save comic URL to filename on disk.""" """Save comic URL to filename on disk."""
self.connect() self.connect()
comicName, comicExt = self.filename, self.ext filename = "%s%s" % (self.filename, self.ext)
comicSize = self.contentLength comicSize = self.contentLength
comicDir = os.path.join(basepath, self.moduleName.replace('/', os.sep)) comicDir = os.path.join(basepath, self.name.replace('/', os.sep))
if not os.path.isdir(comicDir): if not os.path.isdir(comicDir):
os.makedirs(comicDir) os.makedirs(comicDir)
fn = os.path.join(comicDir, '%s%s' % (self.filename, self.ext)) fn = os.path.join(comicDir, filename)
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize: if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
self.urlobj.close() self.urlobj.close()
self.touch(fn) self.touch(fn)
@ -76,10 +92,8 @@ class Comic(object):
return fn, False return fn, False
try: try:
tmpFn = os.path.join(comicDir, '__%s%s' % (self.filename, self.ext)) out.write('Writing comic to file %s...' % (fn,), 3)
out.write('Writing comic to temporary file %s...' % (tmpFn,), 3) with open(fn, 'wb') as comicOut:
comicOut = file(tmpFn, 'wb')
try:
startTime = time.time() startTime = time.time()
if showProgress: if showProgress:
def pollData(): def pollData():
@ -92,12 +106,12 @@ class Comic(object):
else: else:
comicOut.write(self.urlobj.read()) comicOut.write(self.urlobj.read())
endTime = time.time() endTime = time.time()
finally:
comicOut.close()
out.write('Copying temporary file (%s) to %s...' % (tmpFn, fn), 3)
shutil.copy2(tmpFn, fn)
self.touch(fn) self.touch(fn)
except:
if os.path.isfile(fn):
os.remove(fn)
raise
else:
size = os.path.getsize(fn) size = os.path.getsize(fn)
bytes = locale.format('%d', size, True) bytes = locale.format('%d', size, True)
if endTime != startTime: if endTime != startTime:
@ -106,13 +120,8 @@ class Comic(object):
speed = '???' speed = '???'
attrs = dict(fn=fn, bytes=bytes, speed=speed) attrs = dict(fn=fn, bytes=bytes, speed=speed)
out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1) out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
handler.comicDownloaded(self.moduleName, fn) handler.comicDownloaded(self.name, fn)
self.urlobj.close()
finally: finally:
try: self.urlobj.close()
out.write('Removing temporary file %s...' % (tmpFn,), 3)
os.remove(tmpFn)
except:
pass
return fn, True return fn, True

View file

@ -9,11 +9,13 @@ AppName = configdata.name
App = AppName+u" "+Version App = AppName+u" "+Version
Author = configdata.author Author = configdata.author
HtmlAuthor = Author.replace(u' ', u'&nbsp;') HtmlAuthor = Author.replace(u' ', u'&nbsp;')
Copyright = u"Copyright (C) 2004-2008 "+Author Maintainer = configdata.maintainer
HtmlCopyright = u"Copyright &copy; 2004-2008 "+HtmlAuthor HtmlMaintainer = Maintainer.replace(u' ', u'&nbsp;')
Copyright = u"Copyright (C) 2004-2008 "+Author+u", (C) 2012 "+Maintainer
HtmlCopyright = u"Copyright &copy; 2004-2008 "+HtmlAuthor+u", 2012 "+HtmlMaintainer
Url = configdata.url Url = configdata.url
SupportUrl = Url + u"/issues" SupportUrl = Url + u"/issues"
Email = configdata.author_email Email = configdata.maintainer_email
UserAgent = u"Mozilla/5.0 (compatible; %s/%s; +%s)" % (AppName, Version, Url) UserAgent = u"Mozilla/5.0 (compatible; %s/%s; +%s)" % (AppName, Version, Url)
Freeware = AppName+u""" comes with ABSOLUTELY NO WARRANTY! Freeware = AppName+u""" comes with ABSOLUTELY NO WARRANTY!
This is free software, and you are welcome to redistribute it This is free software, and you are welcome to redistribute it

View file

@ -2,112 +2,10 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam # Copyright (C) 2012 Bastian Kleineidam
import re import re
import urlparse
from .util import fetchUrl, fetchManyUrls, getQueryParams from .util import fetchUrl, getQueryParams
from .comic import Comic from .scraper import _BasicScraper
class _BasicScraper(object):
'''Base class with scrape functions for comics.
@type latestUrl: C{string}
@cvar latestUrl: The URL for the latest comic strip.
@type imageUrl: C{string}
@cvar imageUrl: A string that is interpolated with the strip index
to yield the URL for a particular strip.
@type imageSearch: C{regex}
@cvar imageSearch: A compiled regex that will locate the strip image URL
when applied to the strip page.
@type prevSearch: C{regex}
@cvar prevSearch: A compiled regex that will locate the URL for the
previous strip when applied to a strip page.
'''
referrer = None
help = 'Sorry, no help for this comic yet.'
def __init__(self):
"""Initialize internal variables."""
self.currentUrl = None
self.urls = set()
def getReferrer(self, imageUrl, pageUrl):
"""Return referrer for HTTP connection."""
return self.referrer or pageUrl or self.getLatestUrl()
def getComic(self, url, pageUrl):
"""Get comic downloader for given URL and page."""
if not url:
return None
return Comic(self.get_name(), url, filename=self.getFilename(url, pageUrl), referrer=self.getReferrer(url, pageUrl))
def getCurrentComics(self):
"""Get list of current comics."""
self.currentUrl = self.getLatestUrl()
comics = self.getNextComics()
if not comics:
raise ValueError("Could not find current comic.")
return comics
def getNextComics(self):
"""Get all next comics."""
comics = []
while not comics and self.currentUrl and self.currentUrl not in self.urls:
comicUrlGroups, prevUrl = fetchManyUrls(self.currentUrl, [self.imageSearch, self.prevSearch])
if prevUrl:
prevUrl = prevUrl[0]
else:
prevUrl = None
for comicUrl in comicUrlGroups:
comics.append(self.getComic(comicUrl, self.currentUrl))
self.urls.update([self.currentUrl])
self.currentUrl = (prevUrl, None)[prevUrl in self.urls]
return comics
def setStrip(self, index):
"""Set current comic strip URL."""
self.currentUrl = self.imageUrl % index
def getHelp(self):
"""Return help text for this scraper."""
return self.help
def __iter__(self):
"""Iterate through the strips, starting from the current one and going backward."""
if not self.currentUrl:
self.currentUrl = self.getLatestUrl()
comics = True
while comics:
comics = self.getNextComics()
if comics:
yield comics
@classmethod
def get_name(cls):
"""Get scraper name."""
if hasattr(cls, 'name'):
return cls.name
return cls.__name__
@classmethod
def starter(cls):
"""Get starter URL from where to scrape comic strips."""
return cls.latestUrl
@classmethod
def namer(cls, imageUrl, pageUrl):
"""Return filename for given image and page URL."""
return None
def getFilename(self, imageUrl, pageUrl):
"""Return filename for given image and page URL."""
return self.namer(imageUrl, pageUrl)
def getLatestUrl(self):
"""Get starter URL from where to scrape comic strips."""
return self.starter()
def queryNamer(paramName, usePageUrl=False): def queryNamer(paramName, usePageUrl=False):
"""Get name from URL query part.""" """Get name from URL query part."""

74
dosagelib/loader.py Normal file
View file

@ -0,0 +1,74 @@
# -*- coding: iso-8859-1 -*-
import os
import sys
def get_modules(folder, importprefix):
    """Load every importable module file found in a plugin folder.

    Candidate files come from get_importable_modules(); each one is
    imported with load_module(). A module that fails to load with a
    StandardError is reported on standard output and skipped, so one
    broken plugin does not stop the remaining ones from loading.

    @param folder: directory containing the module files
    @ptype folder: string
    @param importprefix: dotted package prefix prepended to each module name
    @ptype importprefix: string
    @return: all successfully loaded modules
    @rtype: iterator of module
    """
    for path in get_importable_modules(folder):
        try:
            loaded = load_module(path, importprefix)
            if loaded is None:
                continue
            yield loaded
        except StandardError as err:
            print("ERROR: could not load module %s: %s" % (path, err))
def get_importable_modules(folder):
    """Find all module files in the given folder that end with '.py' and
    don't start with an underscore.

    The folder is not searched recursively. Names are processed in sorted
    order because os.listdir() returns entries in arbitrary order; sorting
    keeps the resulting load order the same on every platform.

    @param folder: directory to scan
    @ptype folder: string
    @return: module filenames, each joined with the folder path
    @rtype: iterator of string
    """
    for fname in sorted(os.listdir(folder)):
        if fname.endswith('.py') and not fname.startswith('_'):
            yield os.path.join(folder, fname)
def load_module(filename, importprefix):
    """Import and return the module stored in the given file.

    The module name is the file's base name without its extension,
    prefixed with importprefix (e.g. prefix "pkg.plugins." and file
    "/x/foo.py" import "pkg.plugins.foo").
    No exception is caught here: ImportError and any other error raised
    while importing propagate to the caller.

    @param filename: path of the module file
    @ptype filename: string
    @param importprefix: dotted package prefix, may be empty
    @ptype importprefix: string
    @return: the imported module
    @rtype: module
    """
    basename = os.path.basename(filename)
    stem = os.path.splitext(basename)[0]
    qualified = "%s%s" % (importprefix, stem)
    # __import__ returns the top-level package for dotted names, so the
    # module itself is looked up in sys.modules afterwards.
    __import__(qualified)
    return sys.modules[qualified]
def get_plugins(modules, classobj):
    """Collect the plugin classes of several modules into one stream.

    Each module is searched with get_module_plugins() and the results
    are yielded in module order.

    @param modules: the modules to search
    @ptype modules: iterator of modules
    @param classobj: base class a plugin must be a subclass of
    @ptype classobj: class object
    @return: found plugin classes
    @rtype: iterator of class objects
    """
    for mod in modules:
        found = get_module_plugins(mod, classobj)
        for candidate in found:
            yield candidate
def get_module_plugins(module, classobj):
    """Yield every object in the module that is a subclass of classobj.

    If the module defines __all__, only those names are examined;
    otherwise every module attribute whose name does not start with '_'
    is examined. Names that cannot be resolved on the module and objects
    that are not classes are skipped silently.

    @param module: the module to search
    @ptype module: module
    @param classobj: base class to test candidates against
    @ptype classobj: class object
    @return: matching classes
    @rtype: iterator of class objects
    """
    if hasattr(module, '__all__'):
        candidates = module.__all__
    else:
        candidates = [key for key in vars(module) if not key.startswith('_')]
    missing = object()
    for key in candidates:
        member = getattr(module, key, missing)
        if member is missing:
            # listed in __all__ but not actually defined
            continue
        try:
            if not issubclass(member, classobj):
                continue
            yield member
        except TypeError:
            # issubclass() rejects non-class objects
            pass

View file

@ -2,14 +2,14 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, MULTILINE from re import compile, MULTILINE
from ..util import tagre from ..util import tagre
from ..scraper import _BasicScraper
from ..helpers import _BasicScraper, regexNamer, bounceStarter, indirectStarter from ..helpers import regexNamer, bounceStarter, indirectStarter
class ALessonIsLearned(_BasicScraper): class ALessonIsLearned(_BasicScraper):
latestUrl = 'http://www.alessonislearned.com/' latestUrl = 'http://www.alessonislearned.com/'
imageUrl = 'http://www.alessonislearned.com/lesson%s.html' imageUrl = 'http://www.alessonislearned.com/lesson%s.html'
imageSearch = compile(tagre("img", "src", r"(cmx/.+?)")) imageSearch = compile(tagre("img", "src", r"(cmx/lesson.+?)"))
prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=.+?)")+r".+?previous") prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=.+?)")+r".+?previous")
help = 'Index format: nnn' help = 'Index format: nnn'

View file

@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile from re import compile
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
class BadlyDrawnKitties(_BasicScraper): class BadlyDrawnKitties(_BasicScraper):

View file

@ -2,18 +2,23 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile from re import compile
from ..helpers import ( from ..scraper import _BasicScraper
_BasicScraper, constStarter, bounceStarter, indirectStarter) from ..helpers import constStarter, bounceStarter, indirectStarter
from ..util import getQueryParams from ..util import tagre, getQueryParams
class CalvinAndHobbes(_BasicScraper): class CalvinAndHobbes(_BasicScraper):
latestUrl = 'http://www.gocomics.com/calvinandhobbes/' starter = bounceStarter('http://www.gocomics.com/calvinandhobbes/',
compile(tagre("a", "href", "(/calvinandhobbes/\d+/\d+/\d+)")+"Next feature</a>"))
imageUrl = 'http://www.gocomics.com/calvinandhobbes/%s' imageUrl = 'http://www.gocomics.com/calvinandhobbes/%s'
imageSearch = compile(r'src="(http://picayune\.uclick\.com/comics/ch/[^"]+\.gif)"') imageSearch = compile(tagre("img", "src", "(http://assets\.amuniversal\.com/[a-f0-9]+)"))
prevSearch = compile(r'href="(.*?)"\s+onclick="[^"]*">Previous day</a>') prevSearch = compile(tagre("a", "href", "(/calvinandhobbes/\d+/\d+/\d+)")+"Previous feature</a>")
help = 'Index format: yyyy/mm/dd' help = 'Index format: yyyy/mm/dd'
@classmethod
def namer(cls, imageUrl, pageUrl):
prefix, year, month, day = pageUrl.rsplit('/', 3)
return "%s%s%s.gif" % (year, month, day)
class CandyCartoon(_BasicScraper): class CandyCartoon(_BasicScraper):

View file

@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE, MULTILINE from re import compile, IGNORECASE, MULTILINE
from ..helpers import _BasicScraper, bounceStarter, indirectStarter from ..scraper import _BasicScraper
from ..helpers import bounceStarter, indirectStarter
from ..util import getQueryParams from ..util import getQueryParams

View file

@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper, bounceStarter, queryNamer from ..scraper import _BasicScraper
from ..helpers import bounceStarter, queryNamer
def drunkDuck(shortName): def drunkDuck(shortName):

View file

@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper, indirectStarter from ..helpers import indirectStarter
from ..scraper import _BasicScraper
class EerieCuties(_BasicScraper): class EerieCuties(_BasicScraper):

View file

@ -1,9 +1,10 @@
# -*- coding: iso-8859-1 -*- # -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE, MULTILINE from re import compile, IGNORECASE, MULTILINE
from ..util import tagre
from ..helpers import _BasicScraper, indirectStarter from ..util import tagre
from ..scraper import _BasicScraper
from ..helpers import indirectStarter
class FalconTwin(_BasicScraper): class FalconTwin(_BasicScraper):

View file

@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile from re import compile
from ..helpers import _BasicScraper, indirectStarter from ..scraper import _BasicScraper
from ..helpers import indirectStarter
class Galaxion(_BasicScraper): class Galaxion(_BasicScraper):

View file

@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile from re import compile
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
class HappyMedium(_BasicScraper): class HappyMedium(_BasicScraper):

View file

@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
class IDreamOfAJeanieBottle(_BasicScraper): class IDreamOfAJeanieBottle(_BasicScraper):

View file

@ -2,8 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, MULTILINE from re import compile, MULTILINE
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
class Jack(_BasicScraper): class Jack(_BasicScraper):

View file

@ -2,8 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
class KernelPanic(_BasicScraper): class KernelPanic(_BasicScraper):

View file

@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
def keenSpot(comics): def keenSpot(comics):

View file

@ -2,8 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile from re import compile
from ..helpers import _BasicScraper, indirectStarter from ..scraper import _BasicScraper
from ..helpers import indirectStarter
class LasLindas(_BasicScraper): class LasLindas(_BasicScraper):

View file

@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper, queryNamer from ..scraper import _BasicScraper
from ..helpers import queryNamer
class MadamAndEve(_BasicScraper): class MadamAndEve(_BasicScraper):

View file

@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper, indirectStarter, _PHPScraper from ..scraper import _BasicScraper
from ..helpers import indirectStarter, _PHPScraper

View file

@ -2,8 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile from re import compile
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
class NineteenNinetySeven(_BasicScraper): class NineteenNinetySeven(_BasicScraper):

View file

@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper, indirectStarter from ..scraper import _BasicScraper
from ..helpers import indirectStarter
class OctopusPie(_BasicScraper): class OctopusPie(_BasicScraper):

View file

@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper, bounceStarter, queryNamer from ..scraper import _BasicScraper
from ..helpers import bounceStarter, queryNamer
class PartiallyClips(_BasicScraper): class PartiallyClips(_BasicScraper):

View file

@ -2,8 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile from re import compile
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
class QuestionableContent(_BasicScraper): class QuestionableContent(_BasicScraper):

View file

@ -2,8 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile from re import compile
from ..helpers import _BasicScraper, bounceStarter from ..scraper import _BasicScraper
from ..helpers import bounceStarter
class RadioactivePanda(_BasicScraper): class RadioactivePanda(_BasicScraper):

View file

@ -3,7 +3,8 @@
from re import compile, MULTILINE, IGNORECASE, sub from re import compile, MULTILINE, IGNORECASE, sub
from os.path import splitext from os.path import splitext
from ..helpers import _BasicScraper, bounceStarter, indirectStarter from ..scraper import _BasicScraper
from ..helpers import bounceStarter, indirectStarter
class SailorsunOrg(_BasicScraper): class SailorsunOrg(_BasicScraper):

View file

@ -2,8 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper, indirectStarter from ..scraper import _BasicScraper
from ..helpers import indirectStarter
class TalesOfPylea(_BasicScraper): class TalesOfPylea(_BasicScraper):

View file

@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import _BasicScraper, bounceStarter, indirectStarter from ..scraper import _BasicScraper
from ..helpers import bounceStarter, indirectStarter
from ..util import getQueryParams from ..util import getQueryParams

View file

@ -2,8 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE, sub from re import compile, IGNORECASE, sub
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
from ..util import fetchManyMatches, fetchUrl from ..util import fetchUrl
class _UClickScraper(_BasicScraper): class _UClickScraper(_BasicScraper):
@ -24,6 +24,7 @@ class _UClickScraper(_BasicScraper):
'index', 'index',
) )
# XXX refactor this mess
submoduleSearch = compile(r'(<A HREF="http://content.uclick.com/content/\w+.html">[^>]+?</a>)', IGNORECASE) submoduleSearch = compile(r'(<A HREF="http://content.uclick.com/content/\w+.html">[^>]+?</a>)', IGNORECASE)
partsMatch = compile(r'<A HREF="http://content.uclick.com/content/(\w+?).html">([^>]+?)</a>', IGNORECASE) partsMatch = compile(r'<A HREF="http://content.uclick.com/content/(\w+?).html">([^>]+?)</a>', IGNORECASE)
matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0] matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]

View file

@ -2,8 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE, MULTILINE from re import compile, IGNORECASE, MULTILINE
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
class _VGCats(_BasicScraper): class _VGCats(_BasicScraper):

View file

@ -2,7 +2,8 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, IGNORECASE, DOTALL from re import compile, IGNORECASE, DOTALL
from ..helpers import _BasicScraper, queryNamer, bounceStarter from ..scraper import _BasicScraper
from ..helpers import queryNamer, bounceStarter
class WayfarersMoon(_BasicScraper): class WayfarersMoon(_BasicScraper):

View file

@ -2,7 +2,9 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile from re import compile
from ..helpers import _BasicScraper, bounceStarter from ..scraper import _BasicScraper
from ..helpers import bounceStarter
class xkcd(_BasicScraper): class xkcd(_BasicScraper):
starter = bounceStarter('http://xkcd.com/', compile(r'<a rel="next" href="(/?\d+/?)"[^>]*>Next')) starter = bounceStarter('http://xkcd.com/', compile(r'<a rel="next" href="(/?\d+/?)"[^>]*>Next'))

View file

@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile, MULTILINE from re import compile, MULTILINE
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
class YAFGC(_BasicScraper): class YAFGC(_BasicScraper):

View file

@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from re import compile from re import compile
from ..helpers import _BasicScraper from ..scraper import _BasicScraper
class Zapiro(_BasicScraper): class Zapiro(_BasicScraper):

View file

@ -2,15 +2,14 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam # Copyright (C) 2012 Bastian Kleineidam
import os import os
import sys from . import loader
from .util import fetchUrls
from .helpers import _BasicScraper from .comic import ComicStrip
disabled = [] disabled = []
def init_disabled(): def init_disabled():
filename = os.path.expanduser('~/.dosage/disabled') filename = os.path.expanduser('~/.dosage/disabled')
if not os.path.isfile(filename): if os.path.isfile(filename):
return
with open(filename) as f: with open(filename) as f:
for line in f: for line in f:
if line and not line.startswith('#'): if line and not line.startswith('#'):
@ -21,28 +20,104 @@ class DisabledComicError(ValueError):
pass pass
def get(comicName): class _BasicScraper(object):
'''Base class with scrape functions for comics.
@type latestUrl: C{string}
@cvar latestUrl: The URL for the latest comic strip.
@type imageUrl: C{string}
@cvar imageUrl: A string that is interpolated with the strip index
to yield the URL for a particular strip.
@type imageSearch: C{regex}
@cvar imageSearch: A compiled regex that will locate the strip image URL
when applied to the strip page.
@type prevSearch: C{regex}
@cvar prevSearch: A compiled regex that will locate the URL for the
previous strip when applied to a strip page.
'''
help = 'Sorry, no help for this comic yet.'
def __init__(self, indices=None):
"""Initialize internal variables."""
self.urls = set()
self.indices = indices
def getCurrentStrip(self):
"""Get current comic strip."""
return self.getStrip(self.getLatestUrl())
def getStrip(self, url):
"""Get comic strip for given URL."""
imageUrls = fetchUrls(url, self.imageSearch)
return self.getComicStrip(url, imageUrls)
def getComicStrip(self, url, imageUrls):
"""Get comic strip downloader for given URL and images."""
return ComicStrip(self.get_name(), url, imageUrls, self.namer)
def getAllStrips(self):
"""Get all comic strips."""
seen_urls = set()
url = self.getLatestUrl()
while url:
imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
seen_urls.add(url)
yield self.getComicStrip(url, imageUrls)
# avoid recursive URL loops
url = prevUrl if prevUrl not in seen_urls else None
def setStrip(self, index):
"""Set current comic strip URL."""
self.currentUrl = self.imageUrl % index
def getHelp(self):
"""Return help text for this scraper."""
return self.help
@classmethod
def get_name(cls):
"""Get scraper name."""
if hasattr(cls, 'name'):
return cls.name
return cls.__name__
@classmethod
def starter(cls):
"""Get starter URL from where to scrape comic strips."""
return cls.latestUrl
@classmethod
def namer(cls, imageUrl, pageUrl):
"""Return filename for given image and page URL."""
return None
def getFilename(self, imageUrl, pageUrl):
"""Return filename for given image and page URL."""
return self.namer(imageUrl, pageUrl)
def getLatestUrl(self):
"""Get starter URL from where to scrape comic strips."""
return self.starter()
def get_scraper(comic):
"""Returns a comic module object.""" """Returns a comic module object."""
candidates = [] candidates = []
for scraper in get_scrapers(): cname = comic.lower()
lname = scraper.get_name().lower() for scraperclass in get_scrapers():
cname = comicName.lower() lname = scraperclass.get_name().lower()
if lname == cname: if lname == cname:
# perfect match # perfect match
return scraper return scraperclass
if cname in lname: if cname in lname:
candidates.append(scraper) candidates.append(scraperclass)
if len(candidates) == 1: if len(candidates) == 1:
return candidates[0] return candidates[0]
elif candidates: elif candidates:
comics = ", ".join(x.get_name() for x in candidates) comics = ", ".join(x.get_name() for x in candidates)
raise ValueError('Multiple comics %s found.' % comics) raise ValueError('Multiple comics %s found.' % comics)
else: else:
raise ValueError('Comic %r not found.' % comicName) raise ValueError('Comic %r not found.' % comic)
def items():
return get_scrapers()
_scrapers = None _scrapers = None
@ -54,91 +129,23 @@ def get_scrapers():
""" """
global _scrapers global _scrapers
if _scrapers is None: if _scrapers is None:
_scrapers = list(get_all_plugins(get_modules())) folder = os.path.join(os.path.dirname(__file__), 'plugins')
importprefix = 'dosagelib.plugins.'
modules = loader.get_modules(folder, importprefix)
plugins = loader.get_plugins(modules, _BasicScraper)
_scrapers = list(plugins)
_scrapers.sort(key=lambda s: s.get_name()) _scrapers.sort(key=lambda s: s.get_name())
check_scrapers() check_scrapers()
return _scrapers return _scrapers
def check_scrapers(): def check_scrapers():
"""Check for duplicate scraper class names."""
d = {} d = {}
for s in _scrapers: for scraperclass in _scrapers:
name = s.get_name().lower() name = scraperclass.get_name().lower()
if name in d: if name in d:
name1 = s.get_name() name1 = scraperclass.get_name()
name2 = d[name].get_name() name2 = d[name].get_name()
raise ValueError('Duplicate scrapers %s and %s found' % (name1, name2)) raise ValueError('Duplicate scrapers %s and %s found' % (name1, name2))
d[name] = s d[name] = scraperclass
def get_modules():
"""Find all valid modules in the plugins directory. A valid module
must have a .py extension, and is importable.
@return: all loaded valid modules
@rtype: iterator of module
"""
# load from the plugins folder
folder = os.path.join(os.path.dirname(__file__), 'plugins')
for filename in get_importable_modules(folder):
try:
module = load_module(filename)
if module is not None:
yield module
except StandardError, msg:
print "ERROR", msg
def get_importable_modules(folder):
"""Find all module files in the given folder that end witn '.py' and
don't start with an underscore.
@return module filenames
@rtype: iterator of string
"""
for fname in os.listdir(folder):
if fname.endswith('.py') and not fname.startswith('_'):
yield os.path.join(folder, fname)
def load_module(filename):
"""Load and return the module given by the filename.
Other exceptions than ImportError are not catched.
@return: loaded module or None on import errors
@rtype: module or None
"""
name = os.path.splitext(os.path.basename(filename))[0]
modulename = "dosagelib.plugins.%s" % name
__import__(modulename)
return sys.modules[modulename]
def get_all_plugins(modules):
"""Find all scrapers in all modules.
@param modules: the modules to search
@ptype modules: iterator of modules
@return: found scrapers
@rytpe: iterator of class objects
"""
for module in modules:
for plugin in get_plugins(module):
yield plugin
def get_plugins(module):
"""Return all subclasses of _BasicScraper in the module.
If the module defines __all__, only those entries will be searched,
otherwise all objects not starting with '_' will be searched.
"""
try:
names = module.__all__
except AttributeError:
names = [x for x in vars(module) if not x.startswith('_')]
for name in names:
try:
obj = getattr(module, name)
except AttributeError:
continue
try:
if issubclass(obj, _BasicScraper):
yield obj
except TypeError:
continue

View file

@ -21,72 +21,90 @@ if os.name == 'nt':
has_curses = has_module("curses") has_curses = has_module("curses")
class NoMatchError(Exception): MAX_FILESIZE = 1024*1024*1 # 1MB
pass
def getMatchValues(matches): def tagre(tag, attribute, value):
return set([match.group(1) for match in matches]) """Return a regular expression matching the given HTML tag, attribute
and value. It matches the tag and attribute names case insensitive,
and skips arbitrary whitespace and leading HTML attributes. The "<>" at
the start and end of the HTML tag is also matched.
@param tag: the tag name
@ptype tag: string
@param attribute: the attribute name
@ptype attribute: string
@param value: the attribute value
@ptype value: string
@return: the generated regular expression suitable for re.compile()
@rtype: string
"""
attrs = dict(
tag=case_insensitive_re(tag),
attribute=case_insensitive_re(attribute),
value=value,
)
return r'<\s*%(tag)s[^>]*\s+%(attribute)s\s*=\s*"%(value)s"[^>]*/?>' % attrs
def fetchManyMatches(url, regexes):
'''Returns a list containing lists of matches for each regular expression, in the same order.''' def case_insensitive_re(name):
out.write('Matching regex(es) %r multiple times against %s...' % ([rex.pattern for rex in regexes], url), 2) """Reformat the given name to a case insensitive regular expression string
without using re.IGNORECASE. This way selective strings can be made case
insensitive.
@param name: the name to make case insensitive
@ptype name: string
@return: the case insenstive regex
@rtype: string
"""
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
def getPageContent(url):
# read page data
page = urlopen(url) page = urlopen(url)
data = page.read() data = page.read(MAX_FILESIZE)
# determine base URL
matches = [getMatchValues(regex.finditer(data)) for regex in regexes] baseUrl = None
if matches: match = baseSearch.search(data)
out.write('...found %r' % (matches,), 2)
else:
out.write('...not found!', 2)
return list(matches)
def fetchMatches(url, regexes):
out.write('Matching regex(es) %r against %s...' % ([rex.pattern for rex in regexes], url), 2)
page = urlopen(url)
data = page.read()
matches = []
for regex in regexes:
match = regex.search(data)
if match: if match:
matches.append(match.group(1)) baseUrl = match.group(1)
if matches:
out.write('...found %r' % (matches,), 2)
else: else:
out.write('...not found!', 2) baseUrl = url
return data, baseUrl
return matches
def fetchMatch(url, regex): def fetchUrl(url, searchRo):
matches = fetchMatches(url, (regex,)) data, baseUrl = getPageContent(url)
if matches: match = searchRo.search(data)
return matches[0] if match:
searchUrl = match.group(1)
out.write('matched URL %r' % searchUrl, 2)
return urlparse.urljoin(baseUrl, searchUrl)
return None return None
def fetchUrl(url, regex):
match = fetchMatch(url, regex) def fetchUrls(url, imageSearch, prevSearch=None):
data, baseUrl = getPageContent(url)
# match images
imageUrls = set()
for match in imageSearch.finditer(data):
imageUrl = match.group(1)
out.write('matched image URL %r' % imageUrl, 2)
imageUrls.add(urlparse.urljoin(baseUrl, imageUrl))
if not imageUrls:
raise ValueError("No images found at %s with pattern %s" % (url, imageSearch.pattern))
if prevSearch is not None:
# match previous URL
match = prevSearch.search(data)
if match: if match:
return urlparse.urljoin(url, match) prevUrl = match.group(1)
return None out.write('matched previous URL %r' % prevUrl, 2)
prevUrl = urlparse.urljoin(baseUrl, prevUrl)
else:
prevUrl = None
return imageUrls, prevUrl
return imageUrls
baseSearch = re.compile(r'<base\s+href="([^"]*)"\s+/?>', re.IGNORECASE)
def fetchUrls(url, regexes):
matches = fetchMatches(url, [baseSearch] + list(regexes))
baseUrl = matches.pop(0) or url
return [urlparse.urljoin(baseUrl, match) for match in matches]
def fetchManyUrls(url, regexes):
matchGroups = fetchManyMatches(url, [baseSearch] + list(regexes))
baseUrl = matchGroups.pop(0) or [url]
baseUrl = baseUrl[0]
xformedGroups = []
for matchGroup in matchGroups:
xformedGroups.append([urlparse.urljoin(baseUrl, match) for match in matchGroup])
return xformedGroups
def _unescape(text): def _unescape(text):
""" """
@ -278,37 +296,3 @@ def strtimezone():
else: else:
zone = time.timezone zone = time.timezone
return "%+04d" % (-zone//3600) return "%+04d" % (-zone//3600)
def tagre(tag, attribute, value):
"""Return a regular expression matching the given HTML tag, attribute
and value. It matches the tag and attribute names case insensitive,
and skips arbitrary whitespace and leading HTML attributes. The "<>" at
the start and end of the HTML tag is also matched.
@param tag: the tag name
@ptype tag: string
@param attribute: the attribute name
@ptype attribute: string
@param value: the attribute value
@ptype value: string
@return: the generated regular expression suitable for re.compile()
@rtype: string
"""
attrs = dict(
tag=case_insensitive_re(tag),
attribute=case_insensitive_re(attribute),
value=value,
)
return r'<\s*%(tag)s[^>]*\s+%(attribute)s\s*=\s*"%(value)s"[^>]>' % attrs
def case_insensitive_re(name):
"""Reformat the given name to a case insensitive regular expression string
without using re.IGNORECASE. This way selective strings can be made case
insensitive.
@param name: the name to make case insensitive
@ptype name: string
@return: the case insenstive regex
@rtype: string
"""
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)