@ -11,7 +11,7 @@ NUMPROCESSORS:=$(shell grep -c processor /proc/cpuinfo)
# - write test results in file
# - run all tests found in the "tests" subdirectory
PYTESTOPTS:=-n $(NUMPROCESSORS) --resultlog=$(TESTOUTPUT) --tb=short
PYTESTOPTS?=-n $(NUMPROCESSORS) --resultlog=$(TESTOUTPUT) --tb=short
# directory or file with tests to run
TESTS ?= tests
@ -1,9 +1,14 @@
all: man testresults.html
man: $(MANFILES)
dosage.1.html: dosage.1
man2html -r $< | tail -n +2 | sed 's/Time:.*//g' | sed 's@/:@/@g' > $@
# patch --no-backup-if-mismatch --quiet $@ dosage.1.html.diff
testresults.html: ../testresults.txt
../scripts/ $< > $@
.PHONY: all man
@ -21,7 +21,7 @@ you may be infringing upon various copyrights.
List available comics (over 3500 at the moment):
List available comics (over 3000 at the moment):
`$ dosage -l`
Normal file
Normal file
@ -0,0 +1,18 @@
body {
font-family: open-sans, sans-serif;
.item {
width: 100px;
margin: 10px;
float: left;
color: #119911;
color: #992200;
Normal file
Normal file
@ -9,7 +9,7 @@ Section: User Commands (1)<BR><A HREF="#index">Index</A>
<A NAME="lbAB"> </A>
dosage - comic strip downloader
dosage - a commandline webcomic downloader and archiver
<A NAME="lbAC"> </A>
window.onload = function() {
var wall = new Masonry( document.getElementById('container'), {
columnWidth: 240
@ -24,7 +24,7 @@ import optparse
from dosagelib import events, scraper
from dosagelib.output import out
from dosagelib.util import get_columns, internal_error
from dosagelib.util import get_columns, internal_error, getDirname
from dosagelib.configuration import App, Freeware, Copyright, SupportUrl
def setupOptions():
@ -35,7 +35,8 @@ def setupOptions():
usage = 'usage: %prog [options] comicModule [comicModule ...]'
parser = optparse.OptionParser(usage=usage)
parser.add_option('-v', '--verbose', action='count', dest='verbose', default=0, help='provides verbose output, use multiple times for more verbosity')
parser.add_option('-a', '--all', action='count', dest='all', default=None, help='traverse and retrieve all available comics')
parser.add_option('-n', '--numstrips', action='store', dest='numstrips', type='int', default=0, help='traverse and retrieve the given number of comic strips; use --all to retrieve all comic strips')
parser.add_option('-a', '--all', action='store_true', dest='all', default=None, help='traverse and retrieve all comic strips')
parser.add_option('-b', '--basepath', action='store', dest='basepath', default='Comics', help='set the path to create invidivual comic directories in, default is Comics', metavar='PATH')
parser.add_option('--baseurl', action='store', dest='baseurl', default=None, help='the base URL of your comics directory (for RSS, HTML, etc.); this should correspond to --base-path', metavar='PATH')
parser.add_option('-l', '--list', action='store_const', const=1, dest='list', help='list available comic modules')
@ -73,20 +74,26 @@ def saveComicStrip(strip, basepath):
if saved:
allskipped = False
except IOError as msg:
out.write('Error saving %s: %s' % (image.filename, msg))
out.error('Could not save %s: %s' % (image.filename, msg))
errors += 1
return errors, allskipped
def displayHelp(comics, basepath):
"""Print help for comic strips."""
for scraperobj in getScrapers(comics, basepath):
for line in scraperobj.getHelp().splitlines():
out.write("Help: "+line)
||||"Help: "+line)
except ValueError as msg:
return 1
return 0
def getComics(options, comics):
"""Retrieve given comics."""
# XXX refactor
errors = 0
if options.output:
events.installHandler(options.output, options.basepath, options.baseurl)
@ -95,6 +102,8 @@ def getComics(options, comics):
out.context = scraperobj.get_name()
if options.all:
strips = scraperobj.getAllStrips()
elif options.numstrips:
strips = scraperobj.getAllStrips(options.numstrips)
strips = scraperobj.getCurrentStrips()
first = True
@ -105,12 +114,13 @@ def getComics(options, comics):
if not first and scraperobj.indexes:
# stop when indexed retrieval skipped all images for one
# comic strip (except the first one)
out.write("Stop retrieval because image file already exists")
||||"Stop retrieval because image file already exists")
first = False
except IOError as msg:
out.write('Error getting strip: %s' % msg)
except (ValueError, IOError) as msg:
errors += 1
return errors
@ -123,26 +133,22 @@ def run(options, comics):
if options.list:
return doList(options.list == 1)
if len(comics) <= 0:
out.write('Warning: No comics specified, bailing out!')
out.warn('No comics specified, bailing out!')
return 1
if options.modhelp:
return displayHelp(comics, options.basepath)
return getComics(options, comics)
except ValueError as msg:
out.write("Error: %s" % msg)
return 1
def doList(columnList):
"""List available comics."""
out.write('Available comic scrapers:')
||||'Available comic scrapers:')
scrapers = getScrapers(['@@'])
if columnList:
num = doColumnList(scrapers)
num = doSingleList(scrapers)
out.write('%d supported comics.' % num)
||||'%d supported comics.' % num)
return 0
@ -171,9 +177,9 @@ def getScrapers(comics, basepath=None):
if '@' in comics:
# only scrapers whose directory already exists
if len(comics) > 1:
out.write("WARN: using '@' as comic name ignores all other specified comics.\n")
out.warn("using '@' as comic name ignores all other specified comics.")
for scraperclass in scraper.get_scrapers():
dirname = scraperclass.get_name().replace('/', os.sep)
dirname = getDirname(scraperclass.get_name())
if os.path.isdir(os.path.join(basepath, dirname)):
yield scraperclass()
elif '@@' in comics:
@ -181,7 +187,9 @@ def getScrapers(comics, basepath=None):
for scraperclass in scraper.get_scrapers():
yield scraperclass()
# only selected
# get only selected comic scrapers
# store them in a list to catch naming errors early
scrapers = []
for comic in comics:
if ':' in comic:
name, index = comic.split(':', 1)
@ -189,14 +197,19 @@ def getScrapers(comics, basepath=None):
name = comic
indexes = None
yield scraper.get_scraper(name)(indexes=indexes)
for s in scrapers:
yield s
def main():
"""Parse options and execute commands."""
parser = setupOptions()
options, args = parser.parse_args()
res = run(options, args)
# eliminate duplicate comic names
comics = set(args)
res = run(options, comics)
except KeyboardInterrupt:
res = 1
@ -7,7 +7,7 @@ import rfc822
import time
from .output import out
from .util import getImageObject, normaliseURL, unquote, strsize
from .util import getImageObject, normaliseURL, unquote, strsize, getDirname, getFilename
from .events import getHandler
class FetchComicError(IOError):
@ -34,20 +34,21 @@ class ComicStrip(object):
filename = self.namer(url, self.stripUrl)
if filename is None:
filename = url.rsplit('/', 1)[1]
return ComicImage(, url, self.stripUrl, filename)
dirname = getDirname(
return ComicImage(, url, self.stripUrl, dirname, filename)
class ComicImage(object):
"""A comic image downloader."""
def __init__(self, name, url, referrer, filename):
def __init__(self, name, url, referrer, dirname, filename):
"""Set URL and filename."""
|||| = name
self.referrer = referrer
self.url = url
self.dirname = dirname
filename = getFilename(filename)
self.filename, self.ext = os.path.splitext(filename)
self.filename = self.filename.replace(os.sep, '_')
self.ext = self.ext.replace(os.sep, '_')
def connect(self):
"""Connect to host and get meta information."""
@ -71,7 +72,7 @@ class ComicImage(object):
self.ext = '.' + subtype.replace('jpeg', 'jpg')
self.contentLength = int(self.urlobj.headers.get('content-length', 0))
self.lastModified = self.urlobj.headers.get('last-modified')
out.write('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength), 2)
out.debug('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength))
def touch(self, filename):
"""Set last modified date on filename."""
@ -86,18 +87,18 @@ class ComicImage(object):
filename = "%s%s" % (self.filename, self.ext)
comicSize = self.contentLength
comicDir = os.path.join(basepath,'/', os.sep))
comicDir = os.path.join(basepath, self.dirname)
if not os.path.isdir(comicDir):
fn = os.path.join(comicDir, filename)
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
out.write('Skipping existing file "%s".' % fn, 1)
||||'Skipping existing file "%s".' % fn, 1)
return fn, False
out.write('Writing comic to file %s...' % fn, 3)
out.debug('Writing comic to file %s...' % fn)
with open(fn, 'wb') as comicOut:
@ -107,7 +108,7 @@ class ComicImage(object):
size = strsize(os.path.getsize(fn))
out.write("Saved %s (%s)." % (fn, size), 1)
||||"Saved %s (%s)." % (fn, size), 1)
getHandler().comicDownloaded(, fn)
return fn, True
@ -3,6 +3,7 @@
# Copyright (C) 2012 Bastian Kleineidam
from __future__ import print_function
import time
import sys
class Output(object):
"""Print output with context, indentation and optional timestamps."""
@ -13,7 +14,19 @@ class Output(object):
self.level = 0
self.timestamps = False
def write(self, s, level=0):
def info(self, s, level=0):
self.write(s, level=level)
def debug(self, s):
self.write(s, level=2)
def warn(self, s):
self.write("WARN: %s" % s, file=sys.stderr)
def error(self, s):
self.write("ERROR: %s" % s, file=sys.stderr)
def write(self, s, level=0, file=sys.stdout):
"""Write message with indentation, context and optional timestamp."""
if level > self.level:
@ -21,7 +34,8 @@ class Output(object):
timestamp = time.strftime('%H:%M:%S ')
timestamp = ''
print('%s%s> %s' % (timestamp, self.context, s))
print('%s%s> %s' % (timestamp, self.context, s), file=file)
def writelines(self, lines, level=0):
"""Write multiple messages."""
@ -135,6 +135,7 @@ class AstronomyPOTD(_BasicScraper):
compile(r'<a href="(ap\d{6}\.html)">></a>'))
stripUrl = ''
imageSearch = compile(r'<a href="(image/\d{4}/[^"]+)"')
multipleImagesPerStrip = True
prevSearch = compile(r'<a href="(ap\d{6}\.html)"><</a>')
help = 'Index format: yymmdd'
@ -176,10 +177,6 @@ class AGirlAndHerFed(_BasicScraper):
prevSearch = compile(r'<a href="([^"]+)">[^>]+Back')
help = 'Index format: nnn'
def namer(cls, imageUrl, pageUrl):
return pageUrl.split('?')[-1]
class AetheriaEpics(_BasicScraper):
latestUrl = ''
@ -101,6 +101,10 @@ class BoyOnAStickAndSlither(_BasicScraper):
prevSearch = compile(tagre("a", "href", r'(/page/\d+)') + "<span>Next page")
help = 'Index format: n (unpadded)'
def namer(cls, imageUrl, pageUrl):
return pageUrl.rsplit('/')[-1]
class ButternutSquash(_BasicScraper):
latestUrl = ''
@ -206,12 +210,3 @@ class BetweenFailures(_BasicScraper):
prevSearch = compile(tagre("a", "href", r'(http://betweenfailures\.com/archives/archive/[^"]+)', after="previous"))
help = 'Index format: stripnum-strip-name'
class BillyTheBeaker(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + 'index.php?strip=%s'
multipleImagesPerStrip = True
imageSearch = compile(tagre("img", "src", r'(bub\d+_\d+[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(index\.php\?strip\=[^"]+)', after="Previous strip"))
help = 'Index format: nnn'
@ -26,6 +26,15 @@ class CaribbeanBlue(_BasicScraper):
help = 'Index format: nnn-stripname'
class Catalyst(_BasicScraper):
baseUrl = ""
latestUrl = baseUrl + "comic.php?comic_id=415"
stripUrl = baseUrl + "comic.php?comic_id=%s"
imageSearch = compile(tagre("img", "src", r'(http://catalyst\.spiderforest\.com/comics/[^"]+)'))
prevSearch = compile("<center>" + tagre("a", "href", r'(http://catalyst\.spiderforest\.com/comic\.php\?comic_id=\d+)'))
help = 'Index format: number'
class Catena(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + '%s'
@ -98,6 +107,14 @@ class Commissioned(_BasicScraper):
help = 'Index format: n'
class Concession(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + 'index.php?pid=%s'
imageSearch = compile(tagre("img", "src", r'(http://concessioncomic\.com/comics/[^"]+)', after="Comic"))
prevSearch = compile(tagre("a", "href", r'(http://concessioncomic\.com/index\.php\?pid=\d+)', after="nav-prev"))
help = 'Index format: number'
class CoolCatStudio(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + 'strips-cat/ccs%s'
@ -6,7 +6,7 @@ from re import compile
from ..scraper import make_scraper
from ..util import tagre
_imageSearch = compile(tagre("img", "src", r'(/comics/\d+/[^"]+)'))
_imageSearch = compile(tagre("a", "href", r'(/comics/\d+/[^"]+)'))
def add(name, path):
baseurl = ''
@ -9,14 +9,14 @@ from ..helpers import indirectStarter
from ..util import tagre
class DMFA(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + 'Comics/Vol_%s.php'
imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
multipleImagesPerStrip = True
prevSearch = compile(tagre("a", "href", r'((?:Comics/)?Vol[^"]+)')+
tagre("img", "src", r'(?:../)?Images/comicprev\.gif'))
help = 'Index format: nnn (normally, some specials)'
class DailyDose(_BasicScraper):
baseUrl = ''
starter = indirectStarter(baseUrl,
compile(tagre("a", "href", r'(http://dailydoseofcomics\.com/[^"]+)', after="preview")))
stripUrl = baseUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'([^"]+)', before="align(?:none|center)"))
prevSearch = compile(tagre("a", "href", r'(http://dailydoseofcomics\.com/[^"]+)', after="prev"))
help = 'Index format: stripname'
class DandyAndCompany(_BasicScraper):
@ -52,6 +52,16 @@ class DeepFried(_BasicScraper):
help = 'Index format: non'
class DMFA(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + 'Comics/Vol_%s.php'
imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
multipleImagesPerStrip = True
prevSearch = compile(tagre("a", "href", r'((?:Comics/)?Vol[^"]+)')+
tagre("img", "src", r'(?:../)?Images/comicprev\.gif'))
help = 'Index format: nnn (normally, some specials)'
class DoemainOfOurOwn(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + 'index.cgi/%s'
@ -423,7 +423,6 @@ add('Flying_Under_the_Influence')
@ -670,7 +669,6 @@ add('ManBoys')
@ -20,7 +20,7 @@ class EerieCuties(_BasicScraper):
class Eriadan(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.shockdom\.com/webcomics/eriadan/files/[^"]+)', after='alt=""'))
imageSearch = compile(tagre("img", "src", r'(http://www\.shockdom\.com/webcomics/eriadan/files/[^"]+)', after='width="800"'))
prevSearch = compile(tagre("a", "href", r'([^"]+)', after="prev"))
help = 'Index format: yyyy/mm/dd/nnn (unpadded)'
@ -1,28 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from re import compile
from ..scraper import make_scraper
from ..util import asciify
_imageSearch = compile(r'SRC="(http://www\.thefallenangel\.co\.uk/\w+comics/.+?)"')
_prevSearch = compile(r' <a href="(http://www\.thefallenangel\.co\.uk/.+?)"><img[^>]+?src="http://www\.thefallenangel\.co\.uk/images/previousday\.jpg"')
def add(name, shortname):
latestUrl = '' % shortname
classname = "FallenAngel_" + asciify(name)
globals()[classname] = make_scraper(classname,
latestUrl = latestUrl,
stripUrl = latestUrl + '?date=%s',
name='FallenAngel/' + name,
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: yyyymmdd',
add('HighMaintenance', 'hm')
add('FAWK', 'fawk')
add('MalloryChan', 'mallorychan')
@ -4,7 +4,7 @@
from re import compile
from ..scraper import make_scraper
from ..util import tagre
from ..util import tagre, quote
from ..helpers import bounceStarter
_imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[0-9a-f]+)'))
@ -23,7 +23,7 @@ def add(name, shortname):
globals()[classname] = make_scraper(classname,
starter = bounceStarter(baseUrl + shortname, _nextSearch),
name='GoComics/' + name,
stripUrl=baseUrl + shortname + '/%s',
stripUrl=baseUrl + quote(shortname) + '/%s',
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help='Index format: yyyy/mm/dd',
@ -433,7 +433,6 @@ add('Rechid', '/rechid')
add('RedMeat', '/redmeat')
add('RedandRover', '/redandrover')
add('ReplyAll', '/replyall')
add('RichardsPoorAlmanac', '/richards-poor-almanac')
add('RipHaywire', '/riphaywire')
add('RipleysBelieveItorNot', '/ripleysbelieveitornot')
add('Risible', '/risible')
@ -16,6 +16,10 @@ _prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
def add(name, url):
classname = 'KeenSpot_%s' % name
if '/d/' in url:
stripUrl = url.split('/d/')[0] + '/d/%s.html'
stripUrl = url + 'd/%s.html'
def _prevUrlModifier(cls, prevUrl):
@ -28,7 +32,7 @@ def add(name, url):
globals()[classname] = make_scraper(classname,
name='KeenSpot/' + name,
stripUrl=url + 'd/%s.html',
imageSearch = _imageSearch,
prevSearch = _prevSearch,
prevUrlModifier = _prevUrlModifier,
@ -153,7 +157,7 @@ add('CameoComic', '')
add('CampAlaska', '')
add('CampusLife', '')
add('CanYouKeepaSecret', '')
add('Candi', '')
add('Candi', '')
add('CanisLupus', '')
add('CaptainGreyhound', '')
add('CaptainMike', '')
@ -195,7 +199,6 @@ add('CornerAlley13', '')
add('CorporateLife', '')
add('CosmicAwareness', '')
add('CosmicDrift', '')
add('Countyoursheep', '')
add('CracklingSilence', '')
add('Crackwalker', '')
add('CreepyHead', '')
@ -241,7 +244,6 @@ add('DoomedUntoEternalVigilanceForever', '')
add('DormSweetDorm', '')
add('DoubleyouTeeEff', '')
add('Downscale', '')
add('DownwardBound', '')
add('Dragon27sBane', '')
add('DragonBallTM', '')
add('DragonBoy', '')
@ -282,7 +284,6 @@ add('EvilWenchesIncorporated', '')
add('EyeoftheMonkey', '')
add('Ezailia', '')
add('Faces', '')
add('FairestandFallen', '')
add('FakingSanity', '')
add('FalseGods', '')
add('FancyThat', '')
@ -303,7 +304,6 @@ add('FireflyCross', '')
add('FiveMinuteComic', '')
add('Fizzle', '')
add('FlinchandFriends', '')
add('FlipandSplog', '')
add('Flounderville', '')
add('FloydCartoons', '')
add('Flunkies', '')
@ -320,11 +320,10 @@ add('FourDays', '')
add('Fourboys', '')
add('Fox27sFreakyAdventures', '')
add('FoxTails', '')
add('Framed', '')
add('FreakU', '')
add('FreakU', '')
add('FreaksandG33k', '')
add('FredtheDot', '')
add('FreeParking', '')
add('FreeParking', '')
add('FromTheAntiCulture', '')
add('FromTheMargin', '')
add('FruitFlies', '')
@ -337,7 +336,6 @@ add('GambitasBishounen', '')
add('GameJumpers', '')
add('GameMisconduct', '')
add('Gameboy', '')
add('GamerPsychotica', '')
add('GamersParadox', '')
add('GamingGuardians', '')
add('GamingReality', '')
@ -361,13 +359,11 @@ add('GorgeousPrincessCreamyBeamy', '')
add('GothyMcGee', '')
add('GratuitousMangaStyle', '')
add('GraveyardShift', '')
add('Gravity', '')
add('GreenLightGo', '')
add('GroundFloor', '')
add('Grumpythefathamster', '')
add('GuiShinTaeChiAKAGhostHunter', '')
add('HERZBLUT', '')
add('HalflightBreaking', '')
add('HangingAround', '')
add('Hans', '')
add('HaypennyRag', '')
@ -412,7 +408,6 @@ add('InappropriateIrving', '')
add('InfiniteSouls', '')
add('InkyorShaggy', '')
add('IntergalacticSpaceSheriffs', '')
add('Inverloch', '')
add('IpsoFacto', '')
add('ItHurtsToBeThatStupid', '')
add('ItsGravy', '')
@ -470,7 +465,6 @@ add('LifeGoesOn', '')
add('LifeinBellCounty', '')
add('LifeisUnfair', '')
add('LifeofBuddha', '')
add('LifeonForbez', '')
add('Lightbringer', '')
add('LikeItIs', '')
add('LilDude', '')
@ -492,7 +486,6 @@ add('LustForFreelance', '')
add('MEHComics', '')
add('MORONS', '')
add('MTranc3', '')
add('MacHall', '')
add('Maddland', '')
add('MadeInHeaven', '')
add('MagiIndustries', '')
@ -512,7 +505,6 @@ add('MelEverymanAndHisSarcasticTalkingHousepetAmbrose', 'http://everyman.comicge
add('MenschunsererZeitGerman', '')
add('Midcentral', '')
add('MiketheMulletThing', '')
add('Mindmistress', '')
add('Mindtap', '')
add('MinimalismSucks', '')
add('MinimumSecurityUniversity', '')
@ -526,7 +518,6 @@ add('MorysEducation', '')
add('MrBoffleandFriends', '')
add('MrBubbles', '')
add('MrFooAdventures', '')
add('MrPinkBlob', '')
add('MrScience', '')
add('Muertitos', '')
add('Muffythelitlerabbit', '')
@ -545,7 +536,6 @@ add('NastyChocolates', '')
add('NeTrek', '')
add('NeedleandThread', '')
add('NekkoandJoruba', '')
add('NekoTheKitty', '')
add('Nekotime', '')
add('Netjeru', '')
add('NeverYouMind', '')
@ -896,7 +886,6 @@ add('Unconventional', '')
add('UnfamiliarReflection', '')
add('UnlifeOnline', '')
add('UnseenFate', '')
add('Untitled', '')
add('UntitledAgain', '')
add('UrbanFable', '')
add('VRPG', '')
@ -993,7 +982,6 @@ add('silvette', '')
add('skimlinescomAcollectionofthings', '')
add('smut', '')
add('socializedmedicine', '')
add('spacejams', '')
add('spiderfrogballoon', '')
add('theadventuresofmegamanandlink', '')
add('theendofthings', '')
@ -27,6 +27,7 @@ class NeoEarth(_BasicScraper):
class NewAdventuresOfBobbin(_BasicScraper):
latestUrl = ''
imageSearch = compile(tagre("a", "href", r'(\d+\.gif)'))
multipleImagesPerStrip = True
prevSearch = None
help = 'Index format: none'
@ -35,7 +35,7 @@ class OnTheEdge(_BasicScraper):
class OneQuestion(_BasicScraper):
latestUrl = ''
latestUrl = ''
stripUrl = latestUrl + 'comic.php?strip_id=%s'
imageSearch = compile(tagre("img", "src", r'(istrip_files/strips/\d+\.jpg)'))
prevSearch = compile(tagre("a", "href", r'(comic\.php\?strip_id=\d+)') + tagre("img", "src", r'img/arrow_prev\.jpg'))
@ -162,7 +162,3 @@ class PlanescapeSurvival(_BasicScraper):
imageSearch = compile(r'src="(comics/.+?)"')
prevSearch = compile(r'<a href="(.+?)"><img alt="Previous" ')
help = 'Index format: nnn'
def namer(cls, imageUrl, pageUrl):
return pageUrl.split('/')[-1].split('.')[0]
@ -10,8 +10,8 @@ from ..util import tagre
class QuestionableContent(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + 'view.php?comic=%s'
imageSearch = compile(r'/(comics/\d+\.png)"')
prevSearch = compile(r'<a href="(view.php\?comic=\d+)">Previous')
imageSearch = compile(tagre("img", "src", r'([^"]+/comics/[^"]+)', before="strip"))
prevSearch = compile(tagre("a", "href", r'(view\.php\?comic=\d+)') + 'Previous')
help = 'Index format: n (unpadded)'
@ -149,10 +149,6 @@ class SomethingPositive(_BasicScraper):
"(?:" + tagre("img", "src", r'images/previous\.gif') + "|Previous)")
help = 'Index format: mmddyyyy'
def namer(cls, imageUrl, pageUrl):
return pageUrl.split('/')[-1].split('.')[0]
class SexyLosers(_BasicScraper):
stripUrl = ''
@ -16,15 +16,17 @@ class TheNoob(_BasicScraper):
help = 'Index format: nnnn'
class TheOrderOfTheStick(_BasicScraper):
latestUrl = ''
stripUrl = ''
imageSearch = compile(r'<IMG src="(/comics/images/.+?)">')
imageSearch = compile(r'<IMG src="(/comics/images/[^"]+)">')
prevSearch = compile(r'<A href="(/comics/oots\d{4}\.html)"><IMG src="/Images/redesign/ComicNav_Back.gif"')
help = 'Index format: n (unpadded)'
starter = indirectStarter('', compile(r'<A href="(/comics/oots\d{4}\.html)"'))
def namer(cls, imageUrl, pageUrl):
return pageUrl.rsplit('/', 1)[-1][:-5]
class TheParkingLotIsFull(_BasicScraper):
@ -36,7 +38,6 @@ class TheParkingLotIsFull(_BasicScraper):
help = 'Index format: nnn'
class TheWotch(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + '?date=%s'
@ -29,7 +29,7 @@ class UnicornJelly(_BasicScraper):
class UserFriendly(_BasicScraper):
starter = bounceStarter('', compile(r'<area shape="rect" href="(/cartoons/\?id=\d{8}&mode=classic)" coords="[\d, ]+?" alt="">'))
stripUrl = ''
imageSearch = compile(r'<img border="0" src="(\d{2}\w{3}/.+?\.gif)"')
imageSearch = compile(r'<img border="0" src="\s*(\d{2}\w{3}/.+?\.gif)"')
prevSearch = compile(r'<area shape="rect" href="(/cartoons/\?id=\d{8}&mode=classic)" coords="[\d, ]+?" alt="Previous Cartoon">')
help = 'Index format: yyyymmdd'
@ -35,3 +35,7 @@ class ViiviJaWagner(_BasicScraper):
imageSearch = compile(tagre("link", "href", r'(http://hs\d+\.snstatic\.fi/webkuva/oletus/[^"]+)', before="image_src"))
prevSearch = compile(tagre("a", "href", r'(/viivijawagner/[^"]+)', before="prev-cm"))
help = 'Index format: none'
def namer(cls, imageUrl, pageUrl):
return imageUrl.split('=')[1]
Normal file
Normal file
@ -0,0 +1,26 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
from re import compile
from ..scraper import make_scraper
from ..util import tagre
_prevSearch = compile(tagre("a", "href", r'(\?id=\d+)') + tagre("img", "src", r'images/navi-zurueck\.gif'))
_imageSearch = compile(tagre("img", "src", r'([^"]+/img/comic/[^"]+)', after="comicimg"))
def add(name, shortname):
latestUrl = '' % shortname
classname = 'WebcomicEu_%s' % name
globals()[classname] = make_scraper(classname,
name = 'WebcomicEu/' + name,
latestUrl = latestUrl,
stripUrl = latestUrl + '?id=%s',
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: number',
add('TheBessEffect', 'thebesseffect')
add('TheBessEffectEnglish', 'tbe-english')
add('Talandor', 'talandor')
@ -28,5 +28,4 @@ def add(name, subpath):
add('AgnesQuill', 'daveroman/agnes/')
add('MyMuse', 'gc/muse/')
add('NekkoAndJoruba', 'nekkoandjoruba/nekkoandjoruba/')
add('JaxEpoch', 'johngreen/quicken/')
add('ClownSamurai', 'qsamurai/clownsamurai/')
@ -8,7 +8,7 @@ from ..scraper import make_scraper
from ..helpers import bounceStarter
_imageSearch = compile(tagre("img", "src", r'(http://www\.wlpcomics\.com/(?:adult|general)/[^"]+)'))
_imageSearch = compile(tagre("img", "src", r'(http://www\.wlpcomics\.com/(?:adult|general)/[^"]+/comics/[^"]+)'))
_prevSearch = compile(tagre("a", "href", r'(\w+.html)') + 'Previous')
_nextSearch = compile(tagre("a", "href", r'(\w+.html)') + 'Next')
@ -8,6 +8,14 @@ from ..util import tagre
from ..helpers import bounceStarter
class ZapComic(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(http://www\.zapcomic\.com\?comic_object=\d+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.zapcomic\.com/[^"]+)', after="previous-comic-link"))
help = 'Index format: yyyy/mm/nnn-stripname'
class Zapiro(_BasicScraper):
baseUrl = ''
starter = bounceStarter(baseUrl,
@ -23,6 +31,14 @@ class Zapiro(_BasicScraper):
return name
class ZebraGirl(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + '?date=%s'
imageSearch = compile(tagre("img", "src", r"(comics/[^']+)", quote="'"))
prevSearch = compile(tagre("link", "href", r"(/\?date=[^']+)", quote="'", before='Previous'))
help = 'Index format: yyyy-mm-dd'
class ZombieHunters(_BasicScraper):
latestUrl = ''
stripUrl = latestUrl + '?strip_id=%s'
@ -43,7 +43,7 @@ class _BasicScraper(object):
msg = 'Retrieving the current strip'
if self.indexes:
msg += " for indexes %s" % self.indexes
if self.indexes:
for index in self.indexes:
url = self.stripUrl % index
@ -55,40 +55,48 @@ class _BasicScraper(object):
"""Get comic strip for given URL."""
imageUrls = fetchUrls(url, self.imageSearch)[0]
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
raise ValueError("found %d images with %s" % (len(imageUrls), self.imageSearch.pattern))
out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
return self.getComicStrip(url, imageUrls)
def getComicStrip(self, url, imageUrls):
"""Get comic strip downloader for given URL and images."""
return ComicStrip(self.get_name(), url, imageUrls, self.namer)
def getAllStrips(self):
def getAllStrips(self, maxstrips=None):
"""Get all comic strips."""
if maxstrips:
msg = 'Retrieving %d strips' % maxstrips
elif self.indexes:
msg += "Retrieving %d strips for indexes %s" % (len(self.indexes), self.indexes)
msg = 'Retrieving all strips'
if self.indexes:
msg += " for indexes %s" % self.indexes
if self.indexes:
for index in self.indexes:
url = self.stripUrl % index
for strip in self.getAllStripsFor(url):
for strip in self.getStripsFor(url, 1):
yield strip
url = self.getLatestUrl()
for strip in self.getAllStripsFor(url):
for strip in self.getStripsFor(url, maxstrips):
yield strip
def getAllStripsFor(self, url):
"""Get all comic strips for an URL."""
def getStripsFor(self, url, maxstrips):
"""Get comic strips for an URL. If maxstrips is a positive number, stop after
retrieving the given number of strips."""
seen_urls = set()
while url:
imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
prevUrl = self.prevUrlModifier(prevUrl)
out.write("Matched previous URL %s" % prevUrl, 2)
out.debug("Matched previous URL %s" % prevUrl)
yield self.getComicStrip(url, imageUrls)
# avoid recursive URL loops
url = prevUrl if prevUrl not in seen_urls else None
if maxstrips is not None:
maxstrips -= 1
if maxstrips <= 0:
def setStrip(self, index):
"""Set current comic strip URL."""
@ -161,13 +169,13 @@ def get_scrapers():
global _scrapers
if _scrapers is None:
out.write("Loading comic modules...", 2)
out.debug("Loading comic modules...")
modules = loader.get_modules()
plugins = loader.get_plugins(modules, _BasicScraper)
_scrapers = list(plugins)
_scrapers.sort(key=lambda s: s.get_name())
out.write("... %d modules loaded." % len(_scrapers), 2)
out.debug("... %d modules loaded." % len(_scrapers))
return _scrapers
@ -27,6 +27,8 @@ MaxContentBytes = 1024 * 1024 * 2 # 2 MB
# Maximum content size for images
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
# Default connection timeout
ConnectionTimeoutSecs = 60
def tagre(tag, attribute, value, quote='"', before="", after=""):
"""Return a regular expression matching the given HTML tag, attribute
@ -102,7 +104,7 @@ def fetchUrl(url, urlSearch):
searchUrl =
if not searchUrl:
raise ValueError("Match empty URL at %s with pattern %s" % (url, urlSearch.pattern))
out.write('matched URL %r' % searchUrl, 2)
out.debug('matched URL %r' % searchUrl)
return normaliseURL(urlparse.urljoin(baseUrl, searchUrl))
return None
@ -115,10 +117,10 @@ def fetchUrls(url, imageSearch, prevSearch=None):
imageUrl =
if not imageUrl:
raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
out.write('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern), 2)
out.debug('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern))
imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
if not imageUrls:
out.write("warning: no images found at %s with pattern %s" % (url, imageSearch.pattern))
out.warn("no images found at %s with pattern %s" % (url, imageSearch.pattern))
if prevSearch is not None:
# match previous URL
match =
@ -128,7 +130,7 @@ def fetchUrls(url, imageSearch, prevSearch=None):
raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
prevUrl = normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
out.write('no previous URL %s at %s' % (prevSearch.pattern, url), 2)
out.debug('no previous URL %s at %s' % (prevSearch.pattern, url))
prevUrl = None
return imageUrls, prevUrl
return imageUrls, None
@ -183,8 +185,9 @@ def normaliseURL(url):
return urlparse.urlunparse(pu)
def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_bytes=None):
out.write('Open URL %s' % url, 2)
def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_bytes=None,
out.debug('Open URL %s' % url)
assert retries >= 0, 'invalid retry value %r' % retries
assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
headers = {'User-Agent': UserAgent}
@ -192,13 +195,12 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_byt
if referrer:
headers['Referer'] = referrer
req = requests.get(url, headers=headers, config=config, prefetch=False)
req = requests.get(url, headers=headers, config=config, prefetch=False, timeout=timeout)
check_content_size(url, req.headers, max_content_bytes)
return req
except requests.exceptions.RequestException as err:
msg = 'URL retrieval of %s failed: %s' % (url, err)
raise IOError(msg)
def check_content_size(url, headers, max_content_bytes):
@ -251,7 +253,7 @@ def getRelativePath(basepath, path):
def getQueryParams(url):
query = urlparse.urlsplit(url)[3]
out.write('Extracting query parameters from %r (%r)...' % (url, query), 3)
out.debug('Extracting query parameters from %r (%r)...' % (url, query))
return cgi.parse_qs(query)
@ -334,10 +336,16 @@ def asciify(name):
def unquote(text):
while '%' in text:
text = urllib.unquote(text)
newtext = urllib.unquote(text)
if newtext == text:
text = newtext
return text
def quote(text):
return urllib.quote(text)
def strsize (b):
"""Return human representation of bytes b. A negative number of bytes
raises a value error."""
@ -357,3 +365,20 @@ def strsize (b):
return "%.2fGB" % (float(b) / (1024*1024*1024))
return "%.1fGB" % (float(b) / (1024*1024*1024))
def getDirname(name):
    """Return name with every forward slash replaced by os.sep."""
    parts = name.split('/')
    return os.sep.join(parts)
def getFilename(name):
    """Derive a safe file name from name.

    Every character other than ASCII letters, digits, '_', '-' and '.' is
    replaced by an underscore, runs of dots or underscores are collapsed to
    a single character, and all leading dots and minus signs are removed.

    Returns the sanitized name, which may be empty.
    """
    # first replace all illegal chars
    name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
    # then collapse repeated dots and underscores
    name = re.sub(r"\.{2,}", ".", name)
    name = re.sub(r"_{2,}", "_", name)
    # strip every leading dot or minus, not just the first one; otherwise
    # names like '--x' or '.-x' would still start with '-' or '.'
    return name.lstrip(".-")
@ -47,6 +47,7 @@ exclude_comics = [
"Emerald_Winter", # broken images
"Enter_the_Duck_2", # broken images
"ffff", # broken images
"Found_Art", # broken images
"Function_Over_Fashion", # broken images
"Funday_Morning", # broken images
"greys_journey", # broken images
@ -69,6 +70,7 @@ exclude_comics = [
"Louder_Than_Bombs", # broken images
"Lucky_Dawg", # broken images
"Mario_in_Johto", # broken images
"Mary_Sue_Academy", # borken images
"Master", # start page requires login
"Mastermind_BTRN", # broken images
"MAYA_____The_legend_of_Wolf", # broken images
@ -35,6 +35,7 @@ exclude_comics = [
"OysterWar", # too few comics
"PIGTIMES", # comic unavailable
"PS", # comic unavailable
"RichardsPoorAlmanac", # missing images
"SherpaAid", # comic unavailable
"SparComics", # comic unavailable
File diff suppressed because one or more lines are too long
@ -72,6 +72,7 @@ exclude_comics = [
"ComicMischief", # page moved
"ComputerGameAddicts", # page moved
"Concession", # page moved
"Countyoursheep", # broken links
"CorridorZ", # page does not follow standard layout
"CrashBoomMagic", # page moved
"CrazySlowlyGoing", # page has 403 forbidden
@ -85,6 +86,7 @@ exclude_comics = [
"DimBulbComics", # page is gone
"DIVE", # page is gone
"DominicDeegan", # page moved
"DownwardBound", # page does not follow standard layout
"DungeonDamage", # page does not follow standard layout
"Dylan", # page has 403 forbidden
"EarthRiser", # redirects to a new page
@ -99,6 +101,7 @@ exclude_comics = [
"Evilish", # page moved
"EvolBara", # page is gone
"FaerieTales", # page does not follow standard layout
"FairestandFallen", # page does not follow standard layout
"FairyTaleNewVillage", # missing images
"Fate27sTear", # page moved
"FaultyLogic", # page does not follow standard layout
@ -107,9 +110,12 @@ exclude_comics = [
"Flatwood", # page moved
"FLEMComics", # page moved
"FletchersCave", # page is broken
"FlipandSplog", # page does not follow standard layout
"ForcesofGoodandEvil", # page does not follow standard layout
"Framed", # page does not follow standard layout
"FurryBlackDevil", # page moved
"Galacticus", # page has 403 forbidden
"GamerPsychotica", # page does not follow standard layout
"GeebasonParade", # page does not follow standard layout
"geeks", # page moved
"GeminiBright", # page does not follow standard layout
@ -119,9 +125,11 @@ exclude_comics = [
"GODLIKE", # page has 403 forbidden
"GoForIt", # page is gone
"GothBoy", # page moved
"Gravity", # page does not follow standard layout
"Grimage", # page moved
"GrossePointeDogs", # page is broken
"GUComics", # page moved
"HalflightBreaking", # page does not follow standard layout
"HardUnderbelly", # page does not follow standard layout
"HazardousScience", # page is gone
"HereThereBeDragons", # page moved
@ -138,6 +146,7 @@ exclude_comics = [
"InsideJoke", # page is gone
"InsidetheBox", # page has 403 forbidden
"InternationalHopeFoundation", # page does not follow standard layout
"Inverloch", # page does not follow standard layout
"JamieandNick", # page moved
"JasonLovesHisGrandpa", # page is gone
"JavanteasFate", # page is gone
@ -165,8 +174,10 @@ exclude_comics = [
"LinktotheBoards", # page does not follow standard layout
"LinT", # page moved
"LiterallySpeaking", # page does not follow standard layout
"LifeonForbez", # missing images
"LoxieAndZoot", # page does not follow standard layout
"Lunchtable", # missing images
"MacHall", # page does not follow standard layout
"MadWorld", # page has 403 forbidden
"Magellan", # page does not follow standard layout
"Marachan", # missing images
@ -175,13 +186,16 @@ exclude_comics = [
"Meiosis", # page moved
"Michikomonogatari", # page does not follow standard layout
"MidnorthFlourCo", # page has 403 forbidden
"Mindmistress", # page does not follow standard layout
"MintCondition", # page moved
"MisadventuresinPhysics", # page has 403 forbidden
"MobileMadness", # page does not follow standard layout
"MrPinkBlob", # page does not follow standard layout
"MyAngelYouAreAngel", # page is gone
"MyBrainHurts", # page does not follow standard layout
"NAFTANorthAmericanFreeToonAgreementalsoYankuckcanee", # page does not follow standard layout
"NeglectedMarioCharacterComix", # page does not follow standard layout
"NekoTheKitty", # page does not follow standard layout
"Nemutionjewel", # page does not follow standard layout
"Nerdgasm", # missing images
"Nerdz", # page is gone
@ -249,6 +263,7 @@ exclude_comics = [
"SoManyLevels", # page moved
"SomethingSoft", # page is gone
"Sorcery101", # page moved
"spacejams", # page does not follow standard layout
"SpellBinder", # page is gone
"SPQRBlues", # page moved
"StationV3", # page moved
@ -294,6 +309,7 @@ exclude_comics = [
"TwoEvilScientists", # page moved
"TwoLumps", # page moved
"TwoSidesWide", # page moved
"Untitled", # page does not follow standard layout
"Vendetta", # page moved
"VictimsoftheSystem", # page moved
"Victor", # page moved
@ -318,23 +334,19 @@ url_overrides = {
"AmazonSpaceRangers": "",
"ArroganceinSimplicity": "",
"ATasteofEvil": "",
'Candi': '',
"CanYouKeepaSecret": "",
"CapturetheMoment": "",
"CornerAlley13": "",
"Countyoursheep": "",
"FreakU": "",
"FreeParking": "",
"GamerPsychotica": "",
"FreakU": "",
"FreeParking": "",
"GoneAstray": "",
"GoodnEvil": "",
"HalflightBreaking": "",
"HealerOnFeatheredWings": "",
"HowNottoRunAComic": "",
"HurricaneParty": "",
"MacHall": "",
"MaryQuiteContrary": "",
"MoonCrest24": "",
"MrPinkBlob": "",
"NekkoandJoruba": "",
"No4thWalltoBreak": "",
"OtakuKyokai": "",
@ -345,7 +357,6 @@ url_overrides = {
"PlanetsCollide": "",
"RuneMaster": "",
"ShinobiHigh": "",
"spacejams": "",
"TheAdventuresofVindibuddSuperheroInTraining": "",
"TriumphantLosers": "",
"Zortic": "",
@ -19,6 +19,7 @@ htmltemplate = """
<link rel="stylesheet" href="css/main.css">
<link rel="stylesheet" href="css/dosage.css">
<script src="js/masonry.min.js"></script>
<script src=""></script>
<p>Dosage test results from %(date)s</p>
@ -85,6 +86,8 @@ def get_content(filename):
res = []
for name, url in tests:
css = name.split()[-1].lower()
if len(name) > 25 and '/' in name:
name = name.replace('/', '/ ')
if url:
inner = '<a href="%s" class="%s">%s</a>' % (url, css, name)
@ -94,7 +97,7 @@ def get_content(filename):
def main(args):
filename = "testresults.txt"
filename = args[0]
modified = get_mtime(filename)
content = get_content(filename)
attrs = {"date": strdate(modified), "content": content}
@ -7,5 +7,5 @@
rm -f "$script"
echo "#!/bin/sh -e" > "$script"
egrep -v "^\. " testresults.txt | egrep "^F " | cut -b "3-" | sort | awk '{ print "make test TESTOUTPUT=/dev/null TESTS=" $0; }' >> "$script"
egrep -v "^\. " testresults.txt | egrep "^F " | cut -b "3-" | sort | awk '{ print "make test PYTESTOPTS=--tb=short TESTS=" $0; }' >> "$script"
chmod 755 "$script"
@ -36,13 +36,13 @@ class _ComicTester(TestCase):
num = 0
max_strips = 5
for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
images = 0
images = []
for image in strip.getImages():
images += 1
self.check(images > 0, 'failed to find images at %s' % strip.stripUrl)
self.check(images, 'failed to find images at %s' % strip.stripUrl)
if not self.scraperclass.multipleImagesPerStrip:
self.check(images == 1, 'found %d instead of 1 image at %s' % (images, strip.stripUrl))
self.check(len(images) == 1, 'found more than 1 image at %s: %s' % (strip.stripUrl, images))
if num > 0 and self.scraperclass.prevUrlMatchesStripUrl:
num += 1
Reference in a new issue