A lot of refactoring.
This commit is contained in:
parent
4ba973abf5
commit
c707aa893d
37 changed files with 472 additions and 551 deletions
240
dosage
240
dosage
|
@ -20,7 +20,6 @@
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import optparse
|
import optparse
|
||||||
import traceback
|
|
||||||
|
|
||||||
from dosagelib import events, scraper
|
from dosagelib import events, scraper
|
||||||
from dosagelib.output import out
|
from dosagelib.output import out
|
||||||
|
@ -35,13 +34,13 @@ def setupOptions():
|
||||||
usage = 'usage: %prog [options] comicModule [comicModule ...]'
|
usage = 'usage: %prog [options] comicModule [comicModule ...]'
|
||||||
parser = optparse.OptionParser(usage=usage)
|
parser = optparse.OptionParser(usage=usage)
|
||||||
parser.add_option('-v', '--verbose', action='count', dest='verbose', default=0, help='provides verbose output, use multiple times for more verbosity')
|
parser.add_option('-v', '--verbose', action='count', dest='verbose', default=0, help='provides verbose output, use multiple times for more verbosity')
|
||||||
parser.add_option('-c', '--catch-up', action='count', dest='catchup', default=None, help='traverse and retrieve all available comics up until the strip that already exists locally, use twice to retrieve until all strips exist locally')
|
parser.add_option('-c', '--catchup', action='count', dest='catchup', default=None, help='traverse and retrieve all available comics up until the strip that already exists locally, use twice to retrieve until all strips exist locally')
|
||||||
parser.add_option('-b', '--base-path', action='store', dest='basepath', default='Comics', help='set the path to create invidivual comic directories in, default is Comics', metavar='PATH')
|
parser.add_option('-b', '--basepath', action='store', dest='basepath', default='Comics', help='set the path to create invidivual comic directories in, default is Comics', metavar='PATH')
|
||||||
parser.add_option('--base-url', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --base-path', metavar='PATH')
|
parser.add_option('--baseurl', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --base-path', metavar='PATH')
|
||||||
parser.add_option('-l', '--list', action='store_const', const=1, dest='list', help='list available comic modules')
|
parser.add_option('-l', '--list', action='store_const', const=1, dest='list', help='list available comic modules')
|
||||||
parser.add_option('--single-list', action='store_const', const=2, dest='list', help='list available comic modules in a single list')
|
parser.add_option('--singlelist', action='store_const', const=2, dest='list', help='list available comic modules in a single list')
|
||||||
parser.add_option('-V', '--version', action='store_true', dest='version', help='display the version number')
|
parser.add_option('-V', '--version', action='store_true', dest='version', help='display the version number')
|
||||||
parser.add_option('-m', '--module-help', action='store_true', dest='modhelp', help='display help for comic modules')
|
parser.add_option('-m', '--modulehelp', action='store_true', dest='modhelp', help='display help for comic modules')
|
||||||
parser.add_option('-t', '--timestamps', action='store_true', dest='timestamps', default=False, help='print timestamps for all output at any info level')
|
parser.add_option('-t', '--timestamps', action='store_true', dest='timestamps', default=False, help='print timestamps for all output at any info level')
|
||||||
parser.add_option('-o', '--output', action='store', dest='output', choices=events.getHandlers(), help='output formatting for downloaded comics')
|
parser.add_option('-o', '--output', action='store', dest='output', choices=events.getHandlers(), help='output formatting for downloaded comics')
|
||||||
if is_tty(sys.stdout):
|
if is_tty(sys.stdout):
|
||||||
|
@ -54,196 +53,129 @@ def displayVersion():
|
||||||
print App
|
print App
|
||||||
print Copyright
|
print Copyright
|
||||||
print Freeware
|
print Freeware
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
class Dosage(object):
|
def setOutputInfo(options):
|
||||||
"""Main program executing comic commands."""
|
|
||||||
|
|
||||||
def __init__(self, settings):
|
|
||||||
"""Store settings and initialize internal variables."""
|
|
||||||
self.settings = settings
|
|
||||||
self.errors = 0
|
|
||||||
|
|
||||||
def setOutputInfo(self):
|
|
||||||
"""Set global output level and timestamp option."""
|
"""Set global output level and timestamp option."""
|
||||||
out.level = 0
|
out.level = 0
|
||||||
out.level += self.settings['verbose']
|
out.level += options.verbose
|
||||||
out.timestamps = self.settings['timestamps']
|
out.timestamps = options.timestamps
|
||||||
|
|
||||||
def saveComic(self, comic):
|
|
||||||
"""Save one comic strip in an output file."""
|
|
||||||
basepath = self.settings['basepath']
|
|
||||||
progress = self.settings.get('progress', False)
|
|
||||||
fn, saved = comic.save(basepath, progress)
|
|
||||||
return saved
|
|
||||||
|
|
||||||
def saveComics(self, comics):
|
def saveComicStrip(strip, basepath, progress):
|
||||||
"""Save a list of comics."""
|
"""Save a comic strip which can consist of multiple images."""
|
||||||
saved = False
|
errors = 0
|
||||||
for comic in comics:
|
for image in strip.getImages():
|
||||||
saved = self.saveComic(comic) or saved
|
|
||||||
return saved
|
|
||||||
|
|
||||||
def safeOp(self, fp, *args, **kwargs):
|
|
||||||
"""Run a function and catch and report any errors."""
|
|
||||||
try:
|
try:
|
||||||
fp(*args, **kwargs)
|
image.save(basepath, progress)
|
||||||
except Exception:
|
except IOError, msg:
|
||||||
self.errors += 1
|
out.write('Error saving %s: %s' % (image.filename, msg))
|
||||||
type, value, tb = sys.exc_info()
|
errors += 1
|
||||||
out.write('Traceback (most recent call last):', 1)
|
return errors
|
||||||
out.writelines(traceback.format_stack(), 1)
|
|
||||||
out.writelines(traceback.format_tb(tb)[1:], 1)
|
|
||||||
out.writelines(traceback.format_exception_only(type, value))
|
|
||||||
|
|
||||||
def getCurrent(self):
|
|
||||||
"""Retrieve and save all current comic strips."""
|
|
||||||
out.write('Retrieving the current strip...')
|
|
||||||
self.saveComics(self.module.getCurrentComics())
|
|
||||||
|
|
||||||
def getIndex(self, index):
|
def displayHelp(comics, basepath):
|
||||||
"""Retrieve comcis with given index."""
|
"""Print help for comic strips."""
|
||||||
out.write('Retrieving index "%s"....' % (index,))
|
for scraperobj in getScrapers(comics, basepath):
|
||||||
try:
|
for line in scraperobj.getHelp().splitlines():
|
||||||
self.module.setStrip(index)
|
out.write("Help: "+line)
|
||||||
self.saveComics(self.module.getNextComics())
|
return 0
|
||||||
except NotImplementedError:
|
|
||||||
out.write('No indexed retrieval support.')
|
|
||||||
|
|
||||||
def catchup(self):
|
def getComics(options, comics):
|
||||||
"""Save all comics until the current date."""
|
errors = 0
|
||||||
|
events.installHandler(options.output, options.basepath, options.baseurl)
|
||||||
|
events.handler.start()
|
||||||
|
for scraperobj in getScrapers(comics, options.basepath):
|
||||||
|
out.context = scraperobj.get_name()
|
||||||
|
if options.catchup:
|
||||||
out.write('Catching up...')
|
out.write('Catching up...')
|
||||||
for comics in self.module:
|
strips = scraperobj.getAllStrips()
|
||||||
if not self.saveComics(comics) and self.settings['catchup'] < 2:
|
else:
|
||||||
break
|
out.write('Retrieving the current strip...')
|
||||||
|
strips = [scraperobj.getCurrentStrip()]
|
||||||
|
for strip in strips:
|
||||||
|
errors += saveComicStrip(strip, options.basepath, options.progress)
|
||||||
|
events.handler.end()
|
||||||
|
return errors
|
||||||
|
|
||||||
def catchupIndex(self, index):
|
|
||||||
"""Retrieve and save all comics from the given index."""
|
|
||||||
out.write('Catching up from index "%s"...' % (index,))
|
|
||||||
self.module.setStrip(index)
|
|
||||||
for comics in self.module:
|
|
||||||
if not self.saveComics(comics) and self.settings['catchup'] < 2:
|
|
||||||
break
|
|
||||||
|
|
||||||
def getScrapers(self):
|
def run(options, comics):
|
||||||
"""Get list of scraper objects."""
|
"""Execute comic commands."""
|
||||||
return scraper.items()
|
setOutputInfo(options)
|
||||||
|
if options.version:
|
||||||
|
return displayVersion()
|
||||||
|
if options.list:
|
||||||
|
return doList(options.list == 1)
|
||||||
|
if len(comics) <= 0:
|
||||||
|
out.write('Warning: No comics specified, bailing out!')
|
||||||
|
return 1
|
||||||
|
if options.modhelp:
|
||||||
|
return displayHelp(comics, options.basepath)
|
||||||
|
errors = getComics(options, comics)
|
||||||
|
|
||||||
def getExistingComics(self):
|
|
||||||
"""Get all existing comic scrapers."""
|
|
||||||
for scraper in self.getScrapers():
|
|
||||||
dirname = scraper.get_name().replace('/', os.sep)
|
|
||||||
if os.path.isdir(os.path.join(self.settings['basepath'], dirname)):
|
|
||||||
yield scraper
|
|
||||||
|
|
||||||
def doList(self, columnList):
|
def doList(columnList):
|
||||||
"""List available comics."""
|
"""List available comics."""
|
||||||
out.write('Available comic scrapers:')
|
out.write('Available comic scrapers:')
|
||||||
scrapers = self.getScrapers()
|
scrapers = getScrapers(['@@'])
|
||||||
if len(scrapers) > 0:
|
|
||||||
if columnList:
|
if columnList:
|
||||||
self.doColumnList(scrapers)
|
doColumnList(scrapers)
|
||||||
else:
|
else:
|
||||||
self.doSingleList(scrapers)
|
doSingleList(scrapers)
|
||||||
out.write('%d supported comics.' % len(scrapers))
|
out.write('%d supported comics.' % len(scrapers))
|
||||||
|
return 0
|
||||||
|
|
||||||
def doSingleList(self, scrapers):
|
|
||||||
|
def doSingleList(scrapers):
|
||||||
"""Get list of scraper names, one per line."""
|
"""Get list of scraper names, one per line."""
|
||||||
print '\n'.join(scraper.get_name() for scraper in scrapers)
|
print '\n'.join(scraperobj.get_name() for scraperobj in scrapers)
|
||||||
|
|
||||||
def doColumnList(self, scrapers):
|
|
||||||
|
def doColumnList(scrapers):
|
||||||
"""Get list of scraper names with multiple names per line."""
|
"""Get list of scraper names with multiple names per line."""
|
||||||
screenWidth = get_columns()
|
screenWidth = get_columns()
|
||||||
names = [scraper.get_name() for scraper in scrapers]
|
names = [scraperobj.get_name() for scraperobj in scrapers]
|
||||||
maxlen = max([len(name) for name in names])
|
maxlen = max([len(name) for name in names])
|
||||||
namesPerLine = int(screenWidth / (maxlen + 1))
|
namesPerLine = int(screenWidth / (maxlen + 1))
|
||||||
while names:
|
while names:
|
||||||
print ''.join([name.ljust(maxlen) for name in names[:namesPerLine]])
|
print ''.join([name.ljust(maxlen) for name in names[:namesPerLine]])
|
||||||
del names[:namesPerLine]
|
del names[:namesPerLine]
|
||||||
|
|
||||||
def doCatchup(self):
|
|
||||||
"""Catchup comics."""
|
def getScrapers(comics, basepath=None):
|
||||||
for comic in self.useComics():
|
"""Get scraper objects for the given comics."""
|
||||||
if self.indices:
|
if '@' in comics:
|
||||||
self.safeOp(self.catchupIndex, self.indices[0])
|
# only scrapers whose directory already exists
|
||||||
|
if len(comics) > 1:
|
||||||
|
out.write("WARN: using '@' as comic name ignores all other specified comics.\n")
|
||||||
|
for scraperclass in scraper.get_scrapers():
|
||||||
|
dirname = scraperclass.get_name().replace('/', os.sep)
|
||||||
|
if os.path.isdir(os.path.join(basepath, dirname)):
|
||||||
|
yield scraperclass()
|
||||||
|
elif '@@' in comics:
|
||||||
|
# all scrapers
|
||||||
|
if len(comics) > 1:
|
||||||
|
out.write("WARN: using '@@' as comic name ignores all other specified comics.\n")
|
||||||
|
for scraperclass in scraper.get_scrapers():
|
||||||
|
yield scraperclass()
|
||||||
else:
|
else:
|
||||||
self.safeOp(self.catchup)
|
# only selected
|
||||||
|
for comic in comics:
|
||||||
def doCurrent(self):
|
|
||||||
"""Get current comics."""
|
|
||||||
for comic in self.useComics():
|
|
||||||
if self.indices:
|
|
||||||
for index in self.indices:
|
|
||||||
self.safeOp(self.getIndex, index)
|
|
||||||
else:
|
|
||||||
self.safeOp(self.getCurrent)
|
|
||||||
|
|
||||||
def doHelp(self):
|
|
||||||
"""Print help for comic strips."""
|
|
||||||
for scraper in self.useComics():
|
|
||||||
for line in scraper.getHelp().splitlines():
|
|
||||||
out.write("Help: "+line)
|
|
||||||
|
|
||||||
def setupComic(self, scraper):
|
|
||||||
"""Setup the internal comic module from given scraper."""
|
|
||||||
self.module = scraper()
|
|
||||||
out.context = scraper.get_name()
|
|
||||||
return self.module
|
|
||||||
|
|
||||||
def useComics(self):
|
|
||||||
"""Set all comic modules for the defined comics."""
|
|
||||||
for comic in self.comics:
|
|
||||||
c = comic.split(':', 2)
|
c = comic.split(':', 2)
|
||||||
if len(c) > 1:
|
if len(c) > 1:
|
||||||
self.indices = c[1].split(',')
|
indices = c[1].split(',')
|
||||||
else:
|
else:
|
||||||
self.indices = None
|
indices = None
|
||||||
|
|
||||||
moduleName = c[0]
|
moduleName = c[0]
|
||||||
if moduleName == '@':
|
yield scraper.get_scraper(moduleName)(indices=indices)
|
||||||
for s in self.getExistingComics():
|
|
||||||
yield self.setupComic(s)
|
|
||||||
elif moduleName == '@@':
|
|
||||||
for s in self.getScrapers():
|
|
||||||
yield self.setupComic(s)
|
|
||||||
else:
|
|
||||||
yield self.setupComic(scraper.get(moduleName))
|
|
||||||
|
|
||||||
def run(self, comics):
|
|
||||||
"""Execute comic commands."""
|
|
||||||
self.setOutputInfo()
|
|
||||||
self.comics = comics
|
|
||||||
|
|
||||||
om = self.settings['output']
|
|
||||||
events.installHandler(om, self.settings['basepath'], self.settings['baseurl'])
|
|
||||||
events.handler.start()
|
|
||||||
|
|
||||||
if self.settings['version']:
|
|
||||||
displayVersion()
|
|
||||||
elif self.settings['list']:
|
|
||||||
self.doList(self.settings['list'] == 1)
|
|
||||||
elif len(comics) <= 0:
|
|
||||||
out.write('Warning: No comics specified, bailing out!')
|
|
||||||
elif self.settings['modhelp']:
|
|
||||||
self.doHelp()
|
|
||||||
elif self.settings['catchup']:
|
|
||||||
self.doCatchup()
|
|
||||||
else:
|
|
||||||
self.doCurrent()
|
|
||||||
|
|
||||||
events.handler.end()
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Parse options and execute commands."""
|
"""Parse options and execute commands."""
|
||||||
try:
|
try:
|
||||||
parser = setupOptions()
|
parser = setupOptions()
|
||||||
options, args = parser.parse_args()
|
options, args = parser.parse_args()
|
||||||
d = Dosage(options.__dict__)
|
res = run(options, args)
|
||||||
d.run(args)
|
|
||||||
if d.errors:
|
|
||||||
res = 1
|
|
||||||
else:
|
|
||||||
res = 0
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print "Aborted."
|
print "Aborted."
|
||||||
res = 1
|
res = 1
|
||||||
|
|
|
@ -6,8 +6,6 @@ import locale
|
||||||
import rfc822
|
import rfc822
|
||||||
import time
|
import time
|
||||||
import shutil
|
import shutil
|
||||||
# XXX why is this done??
|
|
||||||
locale.setlocale(locale.LC_ALL, '')
|
|
||||||
|
|
||||||
from .output import out
|
from .output import out
|
||||||
from .util import urlopen, saneDataSize, normaliseURL
|
from .util import urlopen, saneDataSize, normaliseURL
|
||||||
|
@ -18,16 +16,34 @@ class FetchComicError(IOError):
|
||||||
"""Exception for comic fetching errors."""
|
"""Exception for comic fetching errors."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class Comic(object):
|
class ComicStrip(object):
|
||||||
"""Download and save a single comic."""
|
"""A list of comic image URLs."""
|
||||||
|
|
||||||
def __init__(self, moduleName, url, referrer=None, filename=None):
|
def __init__(self, name, parentUrl, imageUrls, namer):
|
||||||
|
"""Store the image URL list."""
|
||||||
|
self.name = name
|
||||||
|
self.parentUrl = parentUrl
|
||||||
|
self.imageUrls = imageUrls
|
||||||
|
self.namer = namer
|
||||||
|
|
||||||
|
def getImages(self):
|
||||||
|
"""Get a list of image downloaders."""
|
||||||
|
for imageUrl in self.imageUrls:
|
||||||
|
yield self.getDownloader(normaliseURL(imageUrl))
|
||||||
|
|
||||||
|
def getDownloader(self, url):
|
||||||
|
filename = self.namer(url, self.parentUrl)
|
||||||
|
return ComicImage(self.name, self.parentUrl, url, filename)
|
||||||
|
|
||||||
|
|
||||||
|
class ComicImage(object):
|
||||||
|
def __init__(self, name, referrer, url, filename):
|
||||||
"""Set URL and filename."""
|
"""Set URL and filename."""
|
||||||
self.moduleName = moduleName
|
self.name = name
|
||||||
self.url = normaliseURL(url)
|
|
||||||
self.referrer = referrer
|
self.referrer = referrer
|
||||||
|
self.url = url
|
||||||
if filename is None:
|
if filename is None:
|
||||||
filename = url.split('/')[-1]
|
filename = url.rsplit('/')[1]
|
||||||
self.filename, self.ext = os.path.splitext(filename)
|
self.filename, self.ext = os.path.splitext(filename)
|
||||||
self.filename = self.filename.replace(os.sep, '_')
|
self.filename = self.filename.replace(os.sep, '_')
|
||||||
self.ext = self.ext.replace(os.sep, '_')
|
self.ext = self.ext.replace(os.sep, '_')
|
||||||
|
@ -62,13 +78,13 @@ class Comic(object):
|
||||||
def save(self, basepath, showProgress=False):
|
def save(self, basepath, showProgress=False):
|
||||||
"""Save comic URL to filename on disk."""
|
"""Save comic URL to filename on disk."""
|
||||||
self.connect()
|
self.connect()
|
||||||
comicName, comicExt = self.filename, self.ext
|
filename = "%s%s" % (self.filename, self.ext)
|
||||||
comicSize = self.contentLength
|
comicSize = self.contentLength
|
||||||
comicDir = os.path.join(basepath, self.moduleName.replace('/', os.sep))
|
comicDir = os.path.join(basepath, self.name.replace('/', os.sep))
|
||||||
if not os.path.isdir(comicDir):
|
if not os.path.isdir(comicDir):
|
||||||
os.makedirs(comicDir)
|
os.makedirs(comicDir)
|
||||||
|
|
||||||
fn = os.path.join(comicDir, '%s%s' % (self.filename, self.ext))
|
fn = os.path.join(comicDir, filename)
|
||||||
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
|
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
|
||||||
self.urlobj.close()
|
self.urlobj.close()
|
||||||
self.touch(fn)
|
self.touch(fn)
|
||||||
|
@ -76,10 +92,8 @@ class Comic(object):
|
||||||
return fn, False
|
return fn, False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tmpFn = os.path.join(comicDir, '__%s%s' % (self.filename, self.ext))
|
out.write('Writing comic to file %s...' % (fn,), 3)
|
||||||
out.write('Writing comic to temporary file %s...' % (tmpFn,), 3)
|
with open(fn, 'wb') as comicOut:
|
||||||
comicOut = file(tmpFn, 'wb')
|
|
||||||
try:
|
|
||||||
startTime = time.time()
|
startTime = time.time()
|
||||||
if showProgress:
|
if showProgress:
|
||||||
def pollData():
|
def pollData():
|
||||||
|
@ -92,12 +106,12 @@ class Comic(object):
|
||||||
else:
|
else:
|
||||||
comicOut.write(self.urlobj.read())
|
comicOut.write(self.urlobj.read())
|
||||||
endTime = time.time()
|
endTime = time.time()
|
||||||
finally:
|
|
||||||
comicOut.close()
|
|
||||||
out.write('Copying temporary file (%s) to %s...' % (tmpFn, fn), 3)
|
|
||||||
shutil.copy2(tmpFn, fn)
|
|
||||||
self.touch(fn)
|
self.touch(fn)
|
||||||
|
except:
|
||||||
|
if os.path.isfile(fn):
|
||||||
|
os.remove(fn)
|
||||||
|
raise
|
||||||
|
else:
|
||||||
size = os.path.getsize(fn)
|
size = os.path.getsize(fn)
|
||||||
bytes = locale.format('%d', size, True)
|
bytes = locale.format('%d', size, True)
|
||||||
if endTime != startTime:
|
if endTime != startTime:
|
||||||
|
@ -106,13 +120,8 @@ class Comic(object):
|
||||||
speed = '???'
|
speed = '???'
|
||||||
attrs = dict(fn=fn, bytes=bytes, speed=speed)
|
attrs = dict(fn=fn, bytes=bytes, speed=speed)
|
||||||
out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
|
out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
|
||||||
handler.comicDownloaded(self.moduleName, fn)
|
handler.comicDownloaded(self.name, fn)
|
||||||
self.urlobj.close()
|
|
||||||
finally:
|
finally:
|
||||||
try:
|
self.urlobj.close()
|
||||||
out.write('Removing temporary file %s...' % (tmpFn,), 3)
|
|
||||||
os.remove(tmpFn)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return fn, True
|
return fn, True
|
||||||
|
|
|
@ -9,11 +9,13 @@ AppName = configdata.name
|
||||||
App = AppName+u" "+Version
|
App = AppName+u" "+Version
|
||||||
Author = configdata.author
|
Author = configdata.author
|
||||||
HtmlAuthor = Author.replace(u' ', u' ')
|
HtmlAuthor = Author.replace(u' ', u' ')
|
||||||
Copyright = u"Copyright (C) 2004-2008 "+Author
|
Maintainer = configdata.maintainer
|
||||||
HtmlCopyright = u"Copyright © 2004-2008 "+HtmlAuthor
|
HtmlMaintainer = Maintainer.replace(u' ', u' ')
|
||||||
|
Copyright = u"Copyright (C) 2004-2008 "+Author+u", (C) 2012 "+Maintainer
|
||||||
|
HtmlCopyright = u"Copyright © 2004-2008 "+HtmlAuthor+u", 2012 "+HtmlMaintainer
|
||||||
Url = configdata.url
|
Url = configdata.url
|
||||||
SupportUrl = Url + u"/issues"
|
SupportUrl = Url + u"/issues"
|
||||||
Email = configdata.author_email
|
Email = configdata.maintainer_email
|
||||||
UserAgent = u"Mozilla/5.0 (compatible; %s/%s; +%s)" % (AppName, Version, Url)
|
UserAgent = u"Mozilla/5.0 (compatible; %s/%s; +%s)" % (AppName, Version, Url)
|
||||||
Freeware = AppName+u""" comes with ABSOLUTELY NO WARRANTY!
|
Freeware = AppName+u""" comes with ABSOLUTELY NO WARRANTY!
|
||||||
This is free software, and you are welcome to redistribute it
|
This is free software, and you are welcome to redistribute it
|
||||||
|
|
|
@ -2,112 +2,10 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012 Bastian Kleineidam
|
# Copyright (C) 2012 Bastian Kleineidam
|
||||||
import re
|
import re
|
||||||
|
import urlparse
|
||||||
|
|
||||||
from .util import fetchUrl, fetchManyUrls, getQueryParams
|
from .util import fetchUrl, getQueryParams
|
||||||
from .comic import Comic
|
from .scraper import _BasicScraper
|
||||||
|
|
||||||
class _BasicScraper(object):
|
|
||||||
'''Base class with scrape functions for comics.
|
|
||||||
|
|
||||||
@type latestUrl: C{string}
|
|
||||||
@cvar latestUrl: The URL for the latest comic strip.
|
|
||||||
@type imageUrl: C{string}
|
|
||||||
@cvar imageUrl: A string that is interpolated with the strip index
|
|
||||||
to yield the URL for a particular strip.
|
|
||||||
@type imageSearch: C{regex}
|
|
||||||
@cvar imageSearch: A compiled regex that will locate the strip image URL
|
|
||||||
when applied to the strip page.
|
|
||||||
@type prevSearch: C{regex}
|
|
||||||
@cvar prevSearch: A compiled regex that will locate the URL for the
|
|
||||||
previous strip when applied to a strip page.
|
|
||||||
'''
|
|
||||||
referrer = None
|
|
||||||
help = 'Sorry, no help for this comic yet.'
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
"""Initialize internal variables."""
|
|
||||||
self.currentUrl = None
|
|
||||||
self.urls = set()
|
|
||||||
|
|
||||||
def getReferrer(self, imageUrl, pageUrl):
|
|
||||||
"""Return referrer for HTTP connection."""
|
|
||||||
return self.referrer or pageUrl or self.getLatestUrl()
|
|
||||||
|
|
||||||
def getComic(self, url, pageUrl):
|
|
||||||
"""Get comic downloader for given URL and page."""
|
|
||||||
if not url:
|
|
||||||
return None
|
|
||||||
return Comic(self.get_name(), url, filename=self.getFilename(url, pageUrl), referrer=self.getReferrer(url, pageUrl))
|
|
||||||
|
|
||||||
def getCurrentComics(self):
|
|
||||||
"""Get list of current comics."""
|
|
||||||
self.currentUrl = self.getLatestUrl()
|
|
||||||
comics = self.getNextComics()
|
|
||||||
if not comics:
|
|
||||||
raise ValueError("Could not find current comic.")
|
|
||||||
return comics
|
|
||||||
|
|
||||||
def getNextComics(self):
|
|
||||||
"""Get all next comics."""
|
|
||||||
comics = []
|
|
||||||
while not comics and self.currentUrl and self.currentUrl not in self.urls:
|
|
||||||
comicUrlGroups, prevUrl = fetchManyUrls(self.currentUrl, [self.imageSearch, self.prevSearch])
|
|
||||||
|
|
||||||
if prevUrl:
|
|
||||||
prevUrl = prevUrl[0]
|
|
||||||
else:
|
|
||||||
prevUrl = None
|
|
||||||
|
|
||||||
for comicUrl in comicUrlGroups:
|
|
||||||
comics.append(self.getComic(comicUrl, self.currentUrl))
|
|
||||||
|
|
||||||
self.urls.update([self.currentUrl])
|
|
||||||
self.currentUrl = (prevUrl, None)[prevUrl in self.urls]
|
|
||||||
return comics
|
|
||||||
|
|
||||||
def setStrip(self, index):
|
|
||||||
"""Set current comic strip URL."""
|
|
||||||
self.currentUrl = self.imageUrl % index
|
|
||||||
|
|
||||||
def getHelp(self):
|
|
||||||
"""Return help text for this scraper."""
|
|
||||||
return self.help
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
"""Iterate through the strips, starting from the current one and going backward."""
|
|
||||||
if not self.currentUrl:
|
|
||||||
self.currentUrl = self.getLatestUrl()
|
|
||||||
comics = True
|
|
||||||
while comics:
|
|
||||||
comics = self.getNextComics()
|
|
||||||
if comics:
|
|
||||||
yield comics
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_name(cls):
|
|
||||||
"""Get scraper name."""
|
|
||||||
if hasattr(cls, 'name'):
|
|
||||||
return cls.name
|
|
||||||
return cls.__name__
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def starter(cls):
|
|
||||||
"""Get starter URL from where to scrape comic strips."""
|
|
||||||
return cls.latestUrl
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def namer(cls, imageUrl, pageUrl):
|
|
||||||
"""Return filename for given image and page URL."""
|
|
||||||
return None
|
|
||||||
|
|
||||||
def getFilename(self, imageUrl, pageUrl):
|
|
||||||
"""Return filename for given image and page URL."""
|
|
||||||
return self.namer(imageUrl, pageUrl)
|
|
||||||
|
|
||||||
def getLatestUrl(self):
|
|
||||||
"""Get starter URL from where to scrape comic strips."""
|
|
||||||
return self.starter()
|
|
||||||
|
|
||||||
|
|
||||||
def queryNamer(paramName, usePageUrl=False):
|
def queryNamer(paramName, usePageUrl=False):
|
||||||
"""Get name from URL query part."""
|
"""Get name from URL query part."""
|
||||||
|
|
74
dosagelib/loader.py
Normal file
74
dosagelib/loader.py
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
# -*- coding: iso-8859-1 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def get_modules(folder, importprefix):
    """Yield every module from the given folder that could be loaded.

    Candidate files come from get_importable_modules() and each one is
    loaded with load_module(). When loading raises a StandardError the
    problem is printed and the iteration goes on with the next file.
    @param folder: directory to search for module files
    @param importprefix: prefix handed to load_module() for every file
    @return: all loaded valid modules
    @rtype: iterator of module
    """
    for path in get_importable_modules(folder):
        try:
            loaded = load_module(path, importprefix)
            if loaded is None:
                continue
            yield loaded
        except StandardError as reason:
            # Report the broken module and keep going with the others.
            print("ERROR: could not load module %s: %s" % (path, reason))
|
||||||
|
|
||||||
|
|
||||||
|
def get_importable_modules(folder):
    """Find all module files in the given folder that end with '.py' and
    don't start with an underscore.

    The directory listing is sorted before it is filtered, because
    os.listdir() returns names in arbitrary order and the result should
    not depend on the file system.
    @param folder: directory to search
    @return: module filenames, each joined with the folder
    @rtype: iterator of string
    """
    for fname in sorted(os.listdir(folder)):
        if fname.endswith('.py') and not fname.startswith('_'):
            yield os.path.join(folder, fname)
|
||||||
|
|
||||||
|
|
||||||
|
def load_module(filename, importprefix):
    """Import and return the module given by the filename.

    The module name is the import prefix followed by the base name of the
    file without its extension; the directory part of the filename is not
    used. No exception is caught here: ImportError and every other error
    raised while importing propagate to the caller.
    @param filename: path of the module file
    @param importprefix: string put in front of the file's base name to
        build the module name, for example a package name ending in a dot
    @return: the loaded module
    @rtype: module
    @raise ImportError: if the module cannot be imported
    """
    name = os.path.splitext(os.path.basename(filename))[0]
    modulename = "%s%s" % (importprefix, name)
    # __import__() returns the top-level package for a dotted name, so the
    # module itself is taken from sys.modules afterwards.
    __import__(modulename)
    return sys.modules[modulename]
|
||||||
|
|
||||||
|
|
||||||
|
def get_plugins(modules, classobj):
    """Collect the plugin classes of all given modules.

    Each module is searched with get_module_plugins() and the classes it
    reports are passed on one by one.
    @param modules: the modules to search
    @type modules: iterator of modules
    @param classobj: class handed to get_module_plugins() for matching
    @return: found plugin classes
    @rtype: iterator of class objects
    """
    for module in modules:
        found = get_module_plugins(module, classobj)
        for plugin in found:
            yield plugin
|
||||||
|
|
||||||
|
|
||||||
|
def get_module_plugins(module, classobj):
    """Yield all objects in the module for which issubclass(obj, classobj)
    is true.

    If the module defines __all__, only those entries will be searched,
    otherwise all objects not starting with '_' will be searched.
    @param module: the module to search
    @param classobj: class the module's objects are tested against
    @return: matching classes
    @rtype: iterator of class objects
    """
    try:
        candidates = module.__all__
    except AttributeError:
        candidates = [key for key in vars(module) if not key.startswith('_')]
    for attrname in candidates:
        try:
            candidate = getattr(module, attrname)
        except AttributeError:
            # Name is listed but the module has no such attribute.
            continue
        try:
            if not issubclass(candidate, classobj):
                continue
            yield candidate
        except TypeError:
            # issubclass() rejects objects that are not classes.
            continue
|
|
@ -2,14 +2,14 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, MULTILINE
|
from re import compile, MULTILINE
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
|
from ..scraper import _BasicScraper
|
||||||
from ..helpers import _BasicScraper, regexNamer, bounceStarter, indirectStarter
|
from ..helpers import regexNamer, bounceStarter, indirectStarter
|
||||||
|
|
||||||
|
|
||||||
class ALessonIsLearned(_BasicScraper):
|
class ALessonIsLearned(_BasicScraper):
|
||||||
latestUrl = 'http://www.alessonislearned.com/'
|
latestUrl = 'http://www.alessonislearned.com/'
|
||||||
imageUrl = 'http://www.alessonislearned.com/lesson%s.html'
|
imageUrl = 'http://www.alessonislearned.com/lesson%s.html'
|
||||||
imageSearch = compile(tagre("img", "src", r"(cmx/.+?)"))
|
imageSearch = compile(tagre("img", "src", r"(cmx/lesson.+?)"))
|
||||||
prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=.+?)")+r".+?previous")
|
prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=.+?)")+r".+?previous")
|
||||||
help = 'Index format: nnn'
|
help = 'Index format: nnn'
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
class BadlyDrawnKitties(_BasicScraper):
|
class BadlyDrawnKitties(_BasicScraper):
|
||||||
|
|
|
@ -2,18 +2,23 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..helpers import (
|
from ..scraper import _BasicScraper
|
||||||
_BasicScraper, constStarter, bounceStarter, indirectStarter)
|
from ..helpers import constStarter, bounceStarter, indirectStarter
|
||||||
from ..util import getQueryParams
|
from ..util import tagre, getQueryParams
|
||||||
|
|
||||||
|
|
||||||
class CalvinAndHobbes(_BasicScraper):
|
class CalvinAndHobbes(_BasicScraper):
|
||||||
latestUrl = 'http://www.gocomics.com/calvinandhobbes/'
|
starter = bounceStarter('http://www.gocomics.com/calvinandhobbes/',
|
||||||
|
compile(tagre("a", "href", "(/calvinandhobbes/\d+/\d+/\d+)")+"Next feature</a>"))
|
||||||
imageUrl = 'http://www.gocomics.com/calvinandhobbes/%s'
|
imageUrl = 'http://www.gocomics.com/calvinandhobbes/%s'
|
||||||
imageSearch = compile(r'src="(http://picayune\.uclick\.com/comics/ch/[^"]+\.gif)"')
|
imageSearch = compile(tagre("img", "src", "(http://assets\.amuniversal\.com/[a-f0-9]+)"))
|
||||||
prevSearch = compile(r'href="(.*?)"\s+onclick="[^"]*">Previous day</a>')
|
prevSearch = compile(tagre("a", "href", "(/calvinandhobbes/\d+/\d+/\d+)")+"Previous feature</a>")
|
||||||
help = 'Index format: yyyy/mm/dd'
|
help = 'Index format: yyyy/mm/dd'
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def namer(cls, imageUrl, pageUrl):
|
||||||
|
prefix, year, month, day = pageUrl.rsplit('/', 3)
|
||||||
|
return "%s%s%s.gif" % (year, month, day)
|
||||||
|
|
||||||
|
|
||||||
class CandyCartoon(_BasicScraper):
|
class CandyCartoon(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE, MULTILINE
|
from re import compile, IGNORECASE, MULTILINE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, bounceStarter, indirectStarter
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import bounceStarter, indirectStarter
|
||||||
from ..util import getQueryParams
|
from ..util import getQueryParams
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, bounceStarter, queryNamer
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import bounceStarter, queryNamer
|
||||||
|
|
||||||
|
|
||||||
def drunkDuck(shortName):
|
def drunkDuck(shortName):
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, indirectStarter
|
from ..helpers import indirectStarter
|
||||||
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
class EerieCuties(_BasicScraper):
|
class EerieCuties(_BasicScraper):
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
# -*- coding: iso-8859-1 -*-
|
# -*- coding: iso-8859-1 -*-
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE, MULTILINE
|
from re import compile, IGNORECASE, MULTILINE
|
||||||
from ..util import tagre
|
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, indirectStarter
|
from ..util import tagre
|
||||||
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import indirectStarter
|
||||||
|
|
||||||
|
|
||||||
class FalconTwin(_BasicScraper):
|
class FalconTwin(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, indirectStarter
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import indirectStarter
|
||||||
|
|
||||||
|
|
||||||
class Galaxion(_BasicScraper):
|
class Galaxion(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
class HappyMedium(_BasicScraper):
|
class HappyMedium(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
class IDreamOfAJeanieBottle(_BasicScraper):
|
class IDreamOfAJeanieBottle(_BasicScraper):
|
||||||
|
|
|
@ -2,8 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, MULTILINE
|
from re import compile, MULTILINE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Jack(_BasicScraper):
|
class Jack(_BasicScraper):
|
||||||
|
|
|
@ -2,8 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class KernelPanic(_BasicScraper):
|
class KernelPanic(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
def keenSpot(comics):
|
def keenSpot(comics):
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, indirectStarter
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import indirectStarter
|
||||||
|
|
||||||
|
|
||||||
class LasLindas(_BasicScraper):
|
class LasLindas(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, queryNamer
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import queryNamer
|
||||||
|
|
||||||
|
|
||||||
class MadamAndEve(_BasicScraper):
|
class MadamAndEve(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, indirectStarter, _PHPScraper
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import indirectStarter, _PHPScraper
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class NineteenNinetySeven(_BasicScraper):
|
class NineteenNinetySeven(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, indirectStarter
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import indirectStarter
|
||||||
|
|
||||||
|
|
||||||
class OctopusPie(_BasicScraper):
|
class OctopusPie(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, bounceStarter, queryNamer
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import bounceStarter, queryNamer
|
||||||
|
|
||||||
|
|
||||||
class PartiallyClips(_BasicScraper):
|
class PartiallyClips(_BasicScraper):
|
||||||
|
|
|
@ -2,8 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class QuestionableContent(_BasicScraper):
|
class QuestionableContent(_BasicScraper):
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, bounceStarter
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import bounceStarter
|
||||||
|
|
||||||
|
|
||||||
class RadioactivePanda(_BasicScraper):
|
class RadioactivePanda(_BasicScraper):
|
||||||
|
|
|
@ -3,7 +3,8 @@
|
||||||
from re import compile, MULTILINE, IGNORECASE, sub
|
from re import compile, MULTILINE, IGNORECASE, sub
|
||||||
from os.path import splitext
|
from os.path import splitext
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, bounceStarter, indirectStarter
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import bounceStarter, indirectStarter
|
||||||
|
|
||||||
|
|
||||||
class SailorsunOrg(_BasicScraper):
|
class SailorsunOrg(_BasicScraper):
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, indirectStarter
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import indirectStarter
|
||||||
|
|
||||||
|
|
||||||
class TalesOfPylea(_BasicScraper):
|
class TalesOfPylea(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, bounceStarter, indirectStarter
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import bounceStarter, indirectStarter
|
||||||
from ..util import getQueryParams
|
from ..util import getQueryParams
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE, sub
|
from re import compile, IGNORECASE, sub
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
from ..util import fetchManyMatches, fetchUrl
|
from ..util import fetchUrl
|
||||||
|
|
||||||
|
|
||||||
class _UClickScraper(_BasicScraper):
|
class _UClickScraper(_BasicScraper):
|
||||||
|
@ -24,6 +24,7 @@ class _UClickScraper(_BasicScraper):
|
||||||
'index',
|
'index',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# XXX refactor this mess
|
||||||
submoduleSearch = compile(r'(<A HREF="http://content.uclick.com/content/\w+.html">[^>]+?</a>)', IGNORECASE)
|
submoduleSearch = compile(r'(<A HREF="http://content.uclick.com/content/\w+.html">[^>]+?</a>)', IGNORECASE)
|
||||||
partsMatch = compile(r'<A HREF="http://content.uclick.com/content/(\w+?).html">([^>]+?)</a>', IGNORECASE)
|
partsMatch = compile(r'<A HREF="http://content.uclick.com/content/(\w+?).html">([^>]+?)</a>', IGNORECASE)
|
||||||
matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]
|
matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]
|
||||||
|
|
|
@ -2,8 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE, MULTILINE
|
from re import compile, IGNORECASE, MULTILINE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class _VGCats(_BasicScraper):
|
class _VGCats(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, IGNORECASE, DOTALL
|
from re import compile, IGNORECASE, DOTALL
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, queryNamer, bounceStarter
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import queryNamer, bounceStarter
|
||||||
|
|
||||||
|
|
||||||
class WayfarersMoon(_BasicScraper):
|
class WayfarersMoon(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,9 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..helpers import _BasicScraper, bounceStarter
|
from ..scraper import _BasicScraper
|
||||||
|
from ..helpers import bounceStarter
|
||||||
|
|
||||||
|
|
||||||
class xkcd(_BasicScraper):
|
class xkcd(_BasicScraper):
|
||||||
starter = bounceStarter('http://xkcd.com/', compile(r'<a rel="next" href="(/?\d+/?)"[^>]*>Next'))
|
starter = bounceStarter('http://xkcd.com/', compile(r'<a rel="next" href="(/?\d+/?)"[^>]*>Next'))
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile, MULTILINE
|
from re import compile, MULTILINE
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
class YAFGC(_BasicScraper):
|
class YAFGC(_BasicScraper):
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..helpers import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
|
|
||||||
|
|
||||||
class Zapiro(_BasicScraper):
|
class Zapiro(_BasicScraper):
|
||||||
|
|
|
@ -2,15 +2,14 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012 Bastian Kleineidam
|
# Copyright (C) 2012 Bastian Kleineidam
|
||||||
import os
|
import os
|
||||||
import sys
|
from . import loader
|
||||||
|
from .util import fetchUrls
|
||||||
from .helpers import _BasicScraper
|
from .comic import ComicStrip
|
||||||
|
|
||||||
disabled = []
|
disabled = []
|
||||||
def init_disabled():
|
def init_disabled():
|
||||||
filename = os.path.expanduser('~/.dosage/disabled')
|
filename = os.path.expanduser('~/.dosage/disabled')
|
||||||
if not os.path.isfile(filename):
|
if os.path.isfile(filename):
|
||||||
return
|
|
||||||
with open(filename) as f:
|
with open(filename) as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
if line and not line.startswith('#'):
|
if line and not line.startswith('#'):
|
||||||
|
@ -21,28 +20,104 @@ class DisabledComicError(ValueError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def get(comicName):
|
class _BasicScraper(object):
|
||||||
|
'''Base class with scrape functions for comics.
|
||||||
|
|
||||||
|
@type latestUrl: C{string}
|
||||||
|
@cvar latestUrl: The URL for the latest comic strip.
|
||||||
|
@type imageUrl: C{string}
|
||||||
|
@cvar imageUrl: A string that is interpolated with the strip index
|
||||||
|
to yield the URL for a particular strip.
|
||||||
|
@type imageSearch: C{regex}
|
||||||
|
@cvar imageSearch: A compiled regex that will locate the strip image URL
|
||||||
|
when applied to the strip page.
|
||||||
|
@type prevSearch: C{regex}
|
||||||
|
@cvar prevSearch: A compiled regex that will locate the URL for the
|
||||||
|
previous strip when applied to a strip page.
|
||||||
|
'''
|
||||||
|
help = 'Sorry, no help for this comic yet.'
|
||||||
|
|
||||||
|
def __init__(self, indices=None):
|
||||||
|
"""Initialize internal variables."""
|
||||||
|
self.urls = set()
|
||||||
|
self.indices = indices
|
||||||
|
|
||||||
|
def getCurrentStrip(self):
|
||||||
|
"""Get current comic strip."""
|
||||||
|
return self.getStrip(self.getLatestUrl())
|
||||||
|
|
||||||
|
def getStrip(self, url):
|
||||||
|
"""Get comic strip for given URL."""
|
||||||
|
imageUrls = fetchUrls(url, self.imageSearch)
|
||||||
|
return self.getComicStrip(url, imageUrls)
|
||||||
|
|
||||||
|
def getComicStrip(self, url, imageUrls):
|
||||||
|
"""Get comic strip downloader for given URL and images."""
|
||||||
|
return ComicStrip(self.get_name(), url, imageUrls, self.namer)
|
||||||
|
|
||||||
|
def getAllStrips(self):
|
||||||
|
"""Get all comic strips."""
|
||||||
|
seen_urls = set()
|
||||||
|
url = self.getLatestUrl()
|
||||||
|
while url:
|
||||||
|
imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
|
||||||
|
seen_urls.add(url)
|
||||||
|
yield self.getComicStrip(url, imageUrls)
|
||||||
|
# avoid recursive URL loops
|
||||||
|
url = prevUrl if prevUrl not in seen_urls else None
|
||||||
|
|
||||||
|
def setStrip(self, index):
|
||||||
|
"""Set current comic strip URL."""
|
||||||
|
self.currentUrl = self.imageUrl % index
|
||||||
|
|
||||||
|
def getHelp(self):
|
||||||
|
"""Return help text for this scraper."""
|
||||||
|
return self.help
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_name(cls):
|
||||||
|
"""Get scraper name."""
|
||||||
|
if hasattr(cls, 'name'):
|
||||||
|
return cls.name
|
||||||
|
return cls.__name__
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def starter(cls):
|
||||||
|
"""Get starter URL from where to scrape comic strips."""
|
||||||
|
return cls.latestUrl
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def namer(cls, imageUrl, pageUrl):
|
||||||
|
"""Return filename for given image and page URL."""
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getFilename(self, imageUrl, pageUrl):
|
||||||
|
"""Return filename for given image and page URL."""
|
||||||
|
return self.namer(imageUrl, pageUrl)
|
||||||
|
|
||||||
|
def getLatestUrl(self):
|
||||||
|
"""Get starter URL from where to scrape comic strips."""
|
||||||
|
return self.starter()
|
||||||
|
|
||||||
|
|
||||||
|
def get_scraper(comic):
|
||||||
"""Returns a comic module object."""
|
"""Returns a comic module object."""
|
||||||
candidates = []
|
candidates = []
|
||||||
for scraper in get_scrapers():
|
cname = comic.lower()
|
||||||
lname = scraper.get_name().lower()
|
for scraperclass in get_scrapers():
|
||||||
cname = comicName.lower()
|
lname = scraperclass.get_name().lower()
|
||||||
if lname == cname:
|
if lname == cname:
|
||||||
# perfect match
|
# perfect match
|
||||||
return scraper
|
return scraperclass
|
||||||
if cname in lname:
|
if cname in lname:
|
||||||
candidates.append(scraper)
|
candidates.append(scraperclass)
|
||||||
if len(candidates) == 1:
|
if len(candidates) == 1:
|
||||||
return candidates[0]
|
return candidates[0]
|
||||||
elif candidates:
|
elif candidates:
|
||||||
comics = ", ".join(x.get_name() for x in candidates)
|
comics = ", ".join(x.get_name() for x in candidates)
|
||||||
raise ValueError('Multiple comics %s found.' % comics)
|
raise ValueError('Multiple comics %s found.' % comics)
|
||||||
else:
|
else:
|
||||||
raise ValueError('Comic %r not found.' % comicName)
|
raise ValueError('Comic %r not found.' % comic)
|
||||||
|
|
||||||
|
|
||||||
def items():
|
|
||||||
return get_scrapers()
|
|
||||||
|
|
||||||
|
|
||||||
_scrapers = None
|
_scrapers = None
|
||||||
|
@ -54,91 +129,23 @@ def get_scrapers():
|
||||||
"""
|
"""
|
||||||
global _scrapers
|
global _scrapers
|
||||||
if _scrapers is None:
|
if _scrapers is None:
|
||||||
_scrapers = list(get_all_plugins(get_modules()))
|
folder = os.path.join(os.path.dirname(__file__), 'plugins')
|
||||||
|
importprefix = 'dosagelib.plugins.'
|
||||||
|
modules = loader.get_modules(folder, importprefix)
|
||||||
|
plugins = loader.get_plugins(modules, _BasicScraper)
|
||||||
|
_scrapers = list(plugins)
|
||||||
_scrapers.sort(key=lambda s: s.get_name())
|
_scrapers.sort(key=lambda s: s.get_name())
|
||||||
check_scrapers()
|
check_scrapers()
|
||||||
return _scrapers
|
return _scrapers
|
||||||
|
|
||||||
|
|
||||||
def check_scrapers():
|
def check_scrapers():
|
||||||
|
"""Check for duplicate scraper class names."""
|
||||||
d = {}
|
d = {}
|
||||||
for s in _scrapers:
|
for scraperclass in _scrapers:
|
||||||
name = s.get_name().lower()
|
name = scraperclass.get_name().lower()
|
||||||
if name in d:
|
if name in d:
|
||||||
name1 = s.get_name()
|
name1 = scraperclass.get_name()
|
||||||
name2 = d[name].get_name()
|
name2 = d[name].get_name()
|
||||||
raise ValueError('Duplicate scrapers %s and %s found' % (name1, name2))
|
raise ValueError('Duplicate scrapers %s and %s found' % (name1, name2))
|
||||||
d[name] = s
|
d[name] = scraperclass
|
||||||
|
|
||||||
|
|
||||||
def get_modules():
|
|
||||||
"""Find all valid modules in the plugins directory. A valid module
|
|
||||||
must have a .py extension, and is importable.
|
|
||||||
@return: all loaded valid modules
|
|
||||||
@rtype: iterator of module
|
|
||||||
"""
|
|
||||||
# load from the plugins folder
|
|
||||||
folder = os.path.join(os.path.dirname(__file__), 'plugins')
|
|
||||||
for filename in get_importable_modules(folder):
|
|
||||||
try:
|
|
||||||
module = load_module(filename)
|
|
||||||
if module is not None:
|
|
||||||
yield module
|
|
||||||
except StandardError, msg:
|
|
||||||
print "ERROR", msg
|
|
||||||
|
|
||||||
|
|
||||||
def get_importable_modules(folder):
|
|
||||||
"""Find all module files in the given folder that end witn '.py' and
|
|
||||||
don't start with an underscore.
|
|
||||||
@return module filenames
|
|
||||||
@rtype: iterator of string
|
|
||||||
"""
|
|
||||||
for fname in os.listdir(folder):
|
|
||||||
if fname.endswith('.py') and not fname.startswith('_'):
|
|
||||||
yield os.path.join(folder, fname)
|
|
||||||
|
|
||||||
|
|
||||||
def load_module(filename):
|
|
||||||
"""Load and return the module given by the filename.
|
|
||||||
Other exceptions than ImportError are not catched.
|
|
||||||
@return: loaded module or None on import errors
|
|
||||||
@rtype: module or None
|
|
||||||
"""
|
|
||||||
name = os.path.splitext(os.path.basename(filename))[0]
|
|
||||||
modulename = "dosagelib.plugins.%s" % name
|
|
||||||
__import__(modulename)
|
|
||||||
return sys.modules[modulename]
|
|
||||||
|
|
||||||
|
|
||||||
def get_all_plugins(modules):
|
|
||||||
"""Find all scrapers in all modules.
|
|
||||||
@param modules: the modules to search
|
|
||||||
@ptype modules: iterator of modules
|
|
||||||
@return: found scrapers
|
|
||||||
@rytpe: iterator of class objects
|
|
||||||
"""
|
|
||||||
for module in modules:
|
|
||||||
for plugin in get_plugins(module):
|
|
||||||
yield plugin
|
|
||||||
|
|
||||||
|
|
||||||
def get_plugins(module):
|
|
||||||
"""Return all subclasses of _BasicScraper in the module.
|
|
||||||
If the module defines __all__, only those entries will be searched,
|
|
||||||
otherwise all objects not starting with '_' will be searched.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
names = module.__all__
|
|
||||||
except AttributeError:
|
|
||||||
names = [x for x in vars(module) if not x.startswith('_')]
|
|
||||||
for name in names:
|
|
||||||
try:
|
|
||||||
obj = getattr(module, name)
|
|
||||||
except AttributeError:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
if issubclass(obj, _BasicScraper):
|
|
||||||
yield obj
|
|
||||||
except TypeError:
|
|
||||||
continue
|
|
||||||
|
|
|
@ -21,72 +21,90 @@ if os.name == 'nt':
|
||||||
|
|
||||||
has_curses = has_module("curses")
|
has_curses = has_module("curses")
|
||||||
|
|
||||||
class NoMatchError(Exception):
|
MAX_FILESIZE = 1024*1024*1 # 1MB
|
||||||
pass
|
|
||||||
|
|
||||||
def getMatchValues(matches):
|
def tagre(tag, attribute, value):
|
||||||
return set([match.group(1) for match in matches])
|
"""Return a regular expression matching the given HTML tag, attribute
|
||||||
|
and value. It matches the tag and attribute names case insensitive,
|
||||||
|
and skips arbitrary whitespace and leading HTML attributes. The "<>" at
|
||||||
|
the start and end of the HTML tag is also matched.
|
||||||
|
@param tag: the tag name
|
||||||
|
@ptype tag: string
|
||||||
|
@param attribute: the attribute name
|
||||||
|
@ptype attribute: string
|
||||||
|
@param value: the attribute value
|
||||||
|
@ptype value: string
|
||||||
|
@return: the generated regular expression suitable for re.compile()
|
||||||
|
@rtype: string
|
||||||
|
"""
|
||||||
|
attrs = dict(
|
||||||
|
tag=case_insensitive_re(tag),
|
||||||
|
attribute=case_insensitive_re(attribute),
|
||||||
|
value=value,
|
||||||
|
)
|
||||||
|
return r'<\s*%(tag)s[^>]*\s+%(attribute)s\s*=\s*"%(value)s"[^>]*/?>' % attrs
|
||||||
|
|
||||||
def fetchManyMatches(url, regexes):
|
|
||||||
'''Returns a list containing lists of matches for each regular expression, in the same order.'''
|
def case_insensitive_re(name):
|
||||||
out.write('Matching regex(es) %r multiple times against %s...' % ([rex.pattern for rex in regexes], url), 2)
|
"""Reformat the given name to a case insensitive regular expression string
|
||||||
|
without using re.IGNORECASE. This way selective strings can be made case
|
||||||
|
insensitive.
|
||||||
|
@param name: the name to make case insensitive
|
||||||
|
@ptype name: string
|
||||||
|
@return: the case insenstive regex
|
||||||
|
@rtype: string
|
||||||
|
"""
|
||||||
|
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
|
||||||
|
|
||||||
|
|
||||||
|
baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
|
||||||
|
|
||||||
|
def getPageContent(url):
|
||||||
|
# read page data
|
||||||
page = urlopen(url)
|
page = urlopen(url)
|
||||||
data = page.read()
|
data = page.read(MAX_FILESIZE)
|
||||||
|
# determine base URL
|
||||||
matches = [getMatchValues(regex.finditer(data)) for regex in regexes]
|
baseUrl = None
|
||||||
if matches:
|
match = baseSearch.search(data)
|
||||||
out.write('...found %r' % (matches,), 2)
|
|
||||||
else:
|
|
||||||
out.write('...not found!', 2)
|
|
||||||
|
|
||||||
return list(matches)
|
|
||||||
|
|
||||||
def fetchMatches(url, regexes):
|
|
||||||
out.write('Matching regex(es) %r against %s...' % ([rex.pattern for rex in regexes], url), 2)
|
|
||||||
page = urlopen(url)
|
|
||||||
data = page.read()
|
|
||||||
|
|
||||||
matches = []
|
|
||||||
for regex in regexes:
|
|
||||||
match = regex.search(data)
|
|
||||||
if match:
|
if match:
|
||||||
matches.append(match.group(1))
|
baseUrl = match.group(1)
|
||||||
|
|
||||||
if matches:
|
|
||||||
out.write('...found %r' % (matches,), 2)
|
|
||||||
else:
|
else:
|
||||||
out.write('...not found!', 2)
|
baseUrl = url
|
||||||
|
return data, baseUrl
|
||||||
|
|
||||||
return matches
|
|
||||||
|
|
||||||
def fetchMatch(url, regex):
|
def fetchUrl(url, searchRo):
|
||||||
matches = fetchMatches(url, (regex,))
|
data, baseUrl = getPageContent(url)
|
||||||
if matches:
|
match = searchRo.search(data)
|
||||||
return matches[0]
|
if match:
|
||||||
|
searchUrl = match.group(1)
|
||||||
|
out.write('matched URL %r' % searchUrl, 2)
|
||||||
|
return urlparse.urljoin(baseUrl, searchUrl)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def fetchUrl(url, regex):
|
|
||||||
match = fetchMatch(url, regex)
|
def fetchUrls(url, imageSearch, prevSearch=None):
|
||||||
|
data, baseUrl = getPageContent(url)
|
||||||
|
# match images
|
||||||
|
imageUrls = set()
|
||||||
|
for match in imageSearch.finditer(data):
|
||||||
|
imageUrl = match.group(1)
|
||||||
|
out.write('matched image URL %r' % imageUrl, 2)
|
||||||
|
imageUrls.add(urlparse.urljoin(baseUrl, imageUrl))
|
||||||
|
if not imageUrls:
|
||||||
|
raise ValueError("No images found at %s with pattern %s" % (url, imageSearch.pattern))
|
||||||
|
if prevSearch is not None:
|
||||||
|
# match previous URL
|
||||||
|
match = prevSearch.search(data)
|
||||||
if match:
|
if match:
|
||||||
return urlparse.urljoin(url, match)
|
prevUrl = match.group(1)
|
||||||
return None
|
out.write('matched previous URL %r' % prevUrl, 2)
|
||||||
|
prevUrl = urlparse.urljoin(baseUrl, prevUrl)
|
||||||
|
else:
|
||||||
|
prevUrl = None
|
||||||
|
return imageUrls, prevUrl
|
||||||
|
return imageUrls
|
||||||
|
|
||||||
baseSearch = re.compile(r'<base\s+href="([^"]*)"\s+/?>', re.IGNORECASE)
|
|
||||||
def fetchUrls(url, regexes):
|
|
||||||
matches = fetchMatches(url, [baseSearch] + list(regexes))
|
|
||||||
baseUrl = matches.pop(0) or url
|
|
||||||
return [urlparse.urljoin(baseUrl, match) for match in matches]
|
|
||||||
|
|
||||||
def fetchManyUrls(url, regexes):
|
|
||||||
matchGroups = fetchManyMatches(url, [baseSearch] + list(regexes))
|
|
||||||
baseUrl = matchGroups.pop(0) or [url]
|
|
||||||
baseUrl = baseUrl[0]
|
|
||||||
|
|
||||||
xformedGroups = []
|
|
||||||
for matchGroup in matchGroups:
|
|
||||||
xformedGroups.append([urlparse.urljoin(baseUrl, match) for match in matchGroup])
|
|
||||||
|
|
||||||
return xformedGroups
|
|
||||||
|
|
||||||
def _unescape(text):
|
def _unescape(text):
|
||||||
"""
|
"""
|
||||||
|
@ -278,37 +296,3 @@ def strtimezone():
|
||||||
else:
|
else:
|
||||||
zone = time.timezone
|
zone = time.timezone
|
||||||
return "%+04d" % (-zone//3600)
|
return "%+04d" % (-zone//3600)
|
||||||
|
|
||||||
|
|
||||||
def tagre(tag, attribute, value):
|
|
||||||
"""Return a regular expression matching the given HTML tag, attribute
|
|
||||||
and value. It matches the tag and attribute names case insensitive,
|
|
||||||
and skips arbitrary whitespace and leading HTML attributes. The "<>" at
|
|
||||||
the start and end of the HTML tag is also matched.
|
|
||||||
@param tag: the tag name
|
|
||||||
@ptype tag: string
|
|
||||||
@param attribute: the attribute name
|
|
||||||
@ptype attribute: string
|
|
||||||
@param value: the attribute value
|
|
||||||
@ptype value: string
|
|
||||||
@return: the generated regular expression suitable for re.compile()
|
|
||||||
@rtype: string
|
|
||||||
"""
|
|
||||||
attrs = dict(
|
|
||||||
tag=case_insensitive_re(tag),
|
|
||||||
attribute=case_insensitive_re(attribute),
|
|
||||||
value=value,
|
|
||||||
)
|
|
||||||
return r'<\s*%(tag)s[^>]*\s+%(attribute)s\s*=\s*"%(value)s"[^>]>' % attrs
|
|
||||||
|
|
||||||
def case_insensitive_re(name):
|
|
||||||
"""Reformat the given name to a case insensitive regular expression string
|
|
||||||
without using re.IGNORECASE. This way selective strings can be made case
|
|
||||||
insensitive.
|
|
||||||
@param name: the name to make case insensitive
|
|
||||||
@ptype name: string
|
|
||||||
@return: the case insenstive regex
|
|
||||||
@rtype: string
|
|
||||||
"""
|
|
||||||
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue