Added some comic strips and cleaned up the scraper code.

Bastian Kleineidam 2013-03-06 20:00:30 +01:00
parent 6091138481
commit bae2a96d8b
31 changed files with 296 additions and 128 deletions

View file

@@ -1,14 +1,14 @@
 Dosage
 =======
-Dosage is a commandline webcomic downloader and archiver.
+Dosage is a commandline comic downloader and archiver.

 Introduction
 -------------
 Dosage is designed to keep a local copy of specific webcomics
 and other picture-based content such as Picture of the Day sites.
 With the dosage commandline script you can get the latest strip of
-webcomic, or catch-up to the last strip downloaded, or download a
+a webcomic, or catch-up to the last strip downloaded, or download a
 strip for a particular date/index (if the webcomic's site layout
 makes this possible).
@@ -91,7 +91,7 @@ Technical Description
 Dosage is written in Python and relies on regular expressions to
 do most of the grunt work.
-For each webcomic Dosage has a plugin module, found in the "plugins"
+For each comic Dosage has a plugin module, found in the "plugins"
 subdirectory of the dosagelib directory. Each module is a subclass of
 the _BasicComic class and specifies where to download its comic images.
 Some comic syndicates (GoComics for example) have a standard layout for all
@@ -100,7 +100,7 @@ instances from a given list of comic strips.
 Extending Dosage
 -----------------
-In order to add a new webcomic, a new module class has to be created in
+In order to add a new comic, a new module class has to be created in
 one of the *.py files in the dosagelib/plugins subdirectory.
 Look at the existing module classes for examples.
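
For orientation, a minimal module class might look like the following sketch. The comic name and URL patterns are made up for illustration; the plugin classes added in this commit (Bearmageddon, DeadWinter, IAmArg and the rest) all follow this shape, subclassing _BasicScraper from dosagelib/scraper.py:

    from re import compile
    from ..scraper import _BasicScraper
    from ..util import tagre

    class ExampleComic(_BasicScraper):
        # homepage and %s-template for individual strip pages (hypothetical site)
        url = 'http://examplecomic.invalid/'
        stripUrl = url + '%s/'
        firstStripUrl = stripUrl % '2011/01/01/page-1'
        # regular expressions locating the strip image and the "previous" link
        imageSearch = compile(tagre("img", "src", r'(http://examplecomic\.invalid/comics/[^"]+)'))
        prevSearch = compile(tagre("a", "href", r'(http://examplecomic\.invalid/\d+/\d+/\d+/[^"]+)', after="prev"))
        help = 'Index format: yyyy/mm/dd/stripname'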

View file

@@ -1,5 +1,10 @@
 Dosage 1.13 (released xx.xx.2013)

+Features:
+- comics: Added comic strips AxeCop, Bearmageddon, DeadWinter,
+  HarkAVagrant, IAmArg, LoadingArtist, Nnewts, PHDComics, PokeyThePenguin,
+  SnowFlame and WorldOfMrToast.
+
 Fixes:
 - cmdline: Catch error when piping output to another
   program or file under Windows.

View file

@@ -1,6 +1,6 @@
 .TH DOSAGE 1
 .SH NAME
-dosage \- a commandline webcomic downloader and archiver
+dosage \- a commandline comic downloader and archiver
 .SH SYNOPSIS
 \fBdosage\fP [\fIoptions\fP] \fImodule\fP...
 .SH DESCRIPTION
@@ -128,7 +128,7 @@ Retrieve the Penny Arcade strip for a given index:
 Retrieve Calvin and Hobbes strips from a given index going backwards to
 the beginning.
 .RS
-.B dosage \-a calvinandhobbes:20120722
+.B dosage \-a calvinandhobbes:2012/07/22
 .RE
 .PP
 On Unix, \fBxargs(1)\fP can download several comic strips in parallel,

View file

@@ -9,7 +9,7 @@ Section: User Commands (1)<BR><A HREF="#index">Index</A>
 <A NAME="lbAB">&nbsp;</A>
 <H2>NAME</H2>
-dosage - a commandline webcomic downloader and archiver
+dosage - a commandline comic downloader and archiver
 <A NAME="lbAC">&nbsp;</A>
 <H2>SYNOPSIS</H2>
@@ -174,7 +174,7 @@ Retrieve the Penny Arcade strip for a given index:
 Retrieve Calvin and Hobbes strips from a given index going backwards to
 the beginning.
 <DL COMPACT><DT><DD>
-<B>dosage -a calvinandhobbes:20120722</B>
+<B>dosage -a calvinandhobbes:2012/07/22</B>
 </DL>

View file

@@ -3,7 +3,7 @@ DOSAGE(1)                                                  DOSAGE(1)
 NAME
-       dosage - a commandline webcomic downloader and archiver
+       dosage - a commandline comic downloader and archiver

 SYNOPSIS
        dosage [options] module...
@@ -116,7 +116,7 @@ EXAMPLES
        Retrieve Calvin and Hobbes strips from a given index going
        backwards to the beginning.

-              dosage -a calvinandhobbes:20120722
+              dosage -a calvinandhobbes:2012/07/22

        On Unix, xargs(1) can download several comic strips in paral-
        lel, for example using up to 4 processes:

View file

@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 # -*- coding: iso-8859-1 -*-
-# Dosage, the webcomic downloader
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2013 Bastian Kleineidam
 from __future__ import print_function
@@ -56,7 +55,7 @@ def setupOptions():
     @rtype argparse.ArgumentParser
     """
     kwargs = dict(
-        description = "A commandline webcomic downloader and archiver.",
+        description = "A commandline comic downloader and archiver.",
         epilog = Examples,
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
@@ -131,7 +130,7 @@ def displayHelp(comics):

 def displayComicHelp(scraperobj):
     """Print description and help for a comic."""
-    out.context = scraperobj.get_name()
+    out.context = scraperobj.getName()
     try:
         if scraperobj.description:
             out.info("Description: " + scraperobj.description)
@@ -163,15 +162,16 @@ def getComics(options):

 def getStrips(scraperobj, options):
     """Get all strips from a scraper."""
     errors = 0
-    out.context = scraperobj.get_name()
+    out.context = scraperobj.getName()
     if options.all:
-        strips = scraperobj.getAllStrips()
+        numstrips = None
     elif options.numstrips:
-        strips = scraperobj.getAllStrips(options.numstrips)
+        numstrips = options.numstrips
     else:
-        strips = scraperobj.getCurrentStrips()
+        # get current strip
+        numstrips = 1
     try:
-        for strip in strips:
+        for strip in scraperobj.getStrips(numstrips):
             _errors, skipped = saveComicStrip(strip, options.basepath)
             errors += _errors
             if skipped and options.cont:
@@ -206,7 +206,7 @@ def doList(columnList=True):
     """List available comics."""
     out.info('Available comic scrapers:')
     out.info('Comics marked with [A] require age confirmation with the --adult option.')
-    scrapers = sorted(getScrapers(['@@']), key=lambda s: s.get_name())
+    scrapers = sorted(getScrapers(['@@']), key=lambda s: s.getName())
     try:
         if columnList:
             num = doColumnList(scrapers)
@@ -243,7 +243,7 @@ def doColumnList(scrapers):
 def getScraperName(scraperobj, limit=None):
     """Get comic scraper name."""
     suffix = " [A]" if scraperobj.adult else ""
-    name = scraperobj.get_name()
+    name = scraperobj.getName()
     if limit is not None:
         name = strlimit(name, limit)
     return name + suffix
@@ -259,7 +259,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
             if not adult and scraperclass.adult:
                 warn_adult(scraperclass)
                 continue
-            dirname = getDirname(scraperclass.get_name())
+            dirname = getDirname(scraperclass.getName())
             if os.path.isdir(os.path.join(basepath, dirname)):
                 yield scraperclass()
     elif '@@' in comics:
@@ -293,7 +293,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):

 def warn_adult(scraperclass):
     """Print warning about adult content."""
-    out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.get_name())
+    out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName())

 def main():
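
The change above folds getCurrentStrips() and getAllStrips() into a single getStrips(maxstrips) generator, where None means "all strips". A minimal usage sketch, assuming scraperobj is a scraper instance (for example from getScrapers()) and using only calls that appear in this diff:

    # maxstrips=None walks the whole archive; an integer caps the traversal.
    for strip in scraperobj.getStrips(1):  # just the current strip
        for image in strip.getImages():
            print(image.url)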

View file

@@ -1,8 +1,8 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012 Bastian Kleineidam
+# Copyright (C) 2012-2013 Bastian Kleineidam
 """
-Automated webcomic downloader. Dosage traverses webcomic websites in
+Automated comic downloader. Dosage traverses comic websites in
 order to download each strip of the comic. The intended use is for
 mirroring the strips locally for ease of viewing; redistribution of the
 downloaded strips may violate copyright, and is not advisable unless you
@@ -11,7 +11,7 @@ your intentions, and received permission to distribute.

 The primary dosage interface is currently the 'mainline' script, which
 is just a thin wrapper that invokes L{dosage.mainline}. Comic modules
-for each webcomic are located in L{dosage.modules}; most of these make
+for each comic are located in L{dosage.modules}; most of these make
 use of the helper base classes and mixins in L{dosage.modules.helpers},
 thus making their individual implementations trivial.
 """

View file

@@ -88,6 +88,7 @@ class ComicImage(object):

     def save(self, basepath):
         """Save comic URL to filename on disk."""
+        out.info("Get image URL %s" % self.url, level=1)
         self.connect()
         filename = "%s%s" % (self.filename, self.ext)
         comicSize = self.contentLength
@@ -96,6 +97,7 @@ class ComicImage(object):
             os.makedirs(comicDir)
         fn = os.path.join(comicDir, filename)
+        # compare with >= since comicSize could be the compressed size
         if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
             self.touch(fn)
             out.info('Skipping existing file "%s".' % fn)

View file

@@ -159,13 +159,16 @@ class AstronomyPOTD(_BasicScraper):
     stripUrl = 'http://antwrp.gsfc.nasa.gov/apod/ap%s.html'
     imageSearch = compile(tagre("a", "href", r'(image/\d{4}/[^"]+)'))
     multipleImagesPerStrip = True
-    noImageUrls = set([
+    prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "&lt;</a>")
+    help = 'Index format: yymmdd'
+
+    def shouldSkipUrl(self, url):
+        """Skip pages without images."""
+        return url in (
             'http://antwrp.gsfc.nasa.gov/apod/ap130217.html',  # video
             'http://antwrp.gsfc.nasa.gov/apod/ap130218.html',  # video
             'http://antwrp.gsfc.nasa.gov/apod/ap130226.html',  # video
-    ])
-    prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "&lt;</a>")
-    help = 'Index format: yymmdd'
+        )

     @classmethod
     def namer(cls, imageUrl, pageUrl):
@@ -269,3 +272,14 @@ class Annyseed(_BasicScraper):
     imageSearch = compile(tagre("img", "src", r'(Annyseed[^"]+)'))
     prevSearch = compile(r'<a href="(http://www\.colourofivy\.com/[^"]+)"><img src="Last.gif"')
     help = 'Index format: nnn'
+
+
+class AxeCop(_BasicScraper):
+    url = 'http://axecop.com/'
+    starter = indirectStarter(url, compile(tagre("a", "href", r'(http://axecop\.com/index\.php/acepisodes/read/episode_\d+/)')))
+    stripUrl = url + 'index.php/acepisodes/read/episode_%s/'
+    firstStripUrl = stripUrl % '0'
+    imageSearch = compile(tagre("img", "src", r'(http://axecop\.com/images/uploads/axecop[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://axecop\.com/index\.php/acepisodes/read/episode_\d+/)') +
+        tagre("img", "src", r'http://axecop\.com/acimages/buttons/page_left\.png'))
+    help = 'Index format: number'

View file

@@ -25,6 +25,15 @@ class Bardsworth(_BasicScraper):
     help = 'Index format: nnn'

+class Bearmageddon(_BasicScraper):
+    url = 'http://bearmageddon.com/'
+    stripUrl = url + '%s/'
+    firstStripUrl = stripUrl % '2011/08/01/page-1'
+    imageSearch = compile(tagre("img", "src", r'(http://bearmageddon\.com/comics/[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://bearmageddon\.com/\d+/\d+/\d+/[^"]+)', after='navi-prev'))
+    help = 'Index format: yyyy/mm/dd/stripname'
+
 class BetterDays(_BasicScraper):
     url = 'http://jaynaylor.com/betterdays/'
     stripUrl = url + 'archives/%s.html'
@@ -119,6 +128,16 @@ class BrentalFlossGuest(BrentalFloss):
     stripUrl = url + '?id=%s'

+# XXX disallowed by robots.txt
+class _BringBackRoomies(_BasicScraper):
+    url = "http://www.bringbackroomies.com/"
+    stripUrl = url + "comic/%s"
+    imageSearch = compile(tagre("img", "src", r'(http://www\.bringbackroomies\.com/wp-content/uploads/\d+/\d+/[^"]+)'))
+    prevSearch = compile(tagre("span", "class", "mininav-prev") +
+        tagre("a", "href", r'(http://www\.bringbackroomies\.com/comic/[^"]+)'))
+    help = 'Index format: stripname'
+
 class Brink(_BasicScraper):
     url = 'http://paperfangs.com/brink/'
     stripUrl = url + '?p=%s'

View file

@@ -209,9 +209,12 @@ class CyanideAndHappiness(_BasicScraper):
     stripUrl = url + '%s/'
     imageSearch = compile(tagre("img", "src", r'(http://(?:www\.)?explosm\.net/db/files/[^"]+)', before="a daily webcomic"))
     prevSearch = compile(tagre("a", "href", r'(/comics/\d+/)', before="prev"))
-    noImageUrls = set(["http://www.explosm.net/comics/3082/"])
     help = 'Index format: n (unpadded)'

+    def shouldSkipUrl(self, url):
+        """Skip pages without images."""
+        return url == "http://www.explosm.net/comics/3082/"
+
     @classmethod
     def namer(cls, imageUrl, pageUrl):
         imgname = imageUrl.split('/')[-1]

View file

@@ -54,6 +54,15 @@ class DarkWings(_BasicScraper):
     help = 'Index format: yyyy/mm/dd/page-nn-mm'

+class DeadWinter(_BasicScraper):
+    url = 'http://deadwinter.cc/'
+    stripUrl = url + 'page/%s'
+    firstStripUrl = stripUrl % '1'
+    imageSearch = compile(tagre("img", "src", r"(/static/page/strip/\d+[^']+)", quote="'"))
+    prevSearch = compile(tagre("a", "href", r'(/page/\d+)') + "Previous")
+    help = 'Index format: number'
+
 class DeathToTheExtremist(_BasicScraper):
     url = 'http://www.dtecomic.com/'
     stripUrl = url + '?n=%s'

View file

@@ -44,8 +44,8 @@ class FilibusterCartoons(_BasicScraper):

 class FirstWorldProblems(_BasicScraper):
     url = 'http://bradcolbow.com/archive/C5/'
-    stripUrl = url + '%s'
-    firstStripUrl = 'http://bradcolbow.com/archive/C5/P10/'
+    stripUrl = url + '%s/'
+    firstStripUrl = stripUrl % 'P10'
     imageSearch = compile(tagre("img", "src", r'(http://(?:fwpcomics\.s3\.amazonaws\.com|s3\.amazonaws\.com/fwpcomics)/s1-[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'(http://bradcolbow\.com/archive/C5/[^"]+)', before="prev"))
     multipleImagesPerStrip = True
@@ -126,9 +126,9 @@ class Fallen(_BasicScraper):
         part = pageUrl.split('-')[-1].split('.')[0]
         return '%s-%s' % (part, num)

-    def setStrip(self, index):
+    def getIndexStripUrl(self, index):
         index, part = index.split('-')
-        self.currentUrl = self.stripUrl % (part, index, part)
+        return self.stripUrl % (part, index, part)

 class FredoAndPidjin(_BasicScraper):

View file

@@ -3,6 +3,26 @@
 from re import compile

 from ..scraper import _BasicScraper
 from ..util import tagre
+from ..helpers import bounceStarter
+
+
+class HarkAVagrant(_BasicScraper):
+    url = 'http://www.harkavagrant.com/'
+    starter = bounceStarter(url,
+        compile(tagre("a", "href", r'(http://www\.harkavagrant\.com/index\.php\?id=\d+)') +
+            tagre("img", "src", "buttonnext.png")))
+    stripUrl = url + 'index.php?id=%s'
+    firstStripUrl = stripUrl % '1'
+    imageSearch = compile(tagre("img", "src", r'(http://www.harkavagrant.com/[^"]+)', after='BORDER'))
+    prevSearch = compile(tagre("a", "href", r'(http://www\.harkavagrant\.com/index\.php\?id=\d+)') +
+        tagre("img", "src", "buttonprevious.png"))
+    help = 'Index format: number'
+
+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        filename = imageUrl.rsplit('/', 1)[1]
+        num = pageUrl.rsplit('=', 1)[1]
+        return '%s-%s' % (num, filename)
+
 class HijinksEnsue(_BasicScraper):

View file

@@ -7,6 +7,15 @@ from ..scraper import _BasicScraper
 from ..util import tagre

+class IAmArg(_BasicScraper):
+    url = 'http://iamarg.com/'
+    stripUrl = url + '%s/'
+    firstStripUrl = stripUrl % '2011/05/08/05082011'
+    imageSearch = compile(tagre("img", "src", r'(http://iamarg\.com/comics/\d+-\d+-\d+[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://iamarg\.com/\d+/\d+/\d+/[^"]+)', after="prev"))
+    help = 'Index format: yyyy/mm/dd/stripname'
+
 class IanJay(_BasicScraper):
     url = 'http://ianjay.net/'
     stripUrl = url + '?p=%s'

View file

@@ -37,8 +37,8 @@ class KevinAndKell(_BasicScraper):
     prevSearch = compile(r'<a.+?href="(/?(\.\./)?\d+/kk\d+\.html)"[^>]*><span>Previous Strip', IGNORECASE)
     help = 'Index format: yyyy-mm-dd'

-    def setStrip(self, index):
-        self.currentUrl = self.stripUrl % tuple(map(int, index.split('-')))
+    def getIndexStripUrl(self, index):
+        return self.stripUrl % tuple(map(int, index.split('-')))

 class KhaosKomix(_BasicScraper):

View file

@@ -24,6 +24,15 @@ class Lint(_BasicScraper):
     help = 'Index format: yyyy/mm/dd/num-name'

+class LoadingArtist(_BasicScraper):
+    url = 'http://www.loadingartist.com/'
+    stripUrl = url + '%s/'
+    firstStripUrl = stripUrl % '2011/01/04/born'
+    imageSearch = compile(tagre("img", "src", r'(http://www\.loadingartist\.com/comics/[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://www\.loadingartist\.com/\d+/\d+/\d+/[^"]+/)', after="prev"))
+    help = 'Index format: yyyy/mm/dd/stripname'
+
 class LookingForGroup(_BasicScraper):
     url = 'http://www.lfgcomic.com/'
     stripUrl = url + 'page/%s/'
@@ -33,6 +42,7 @@ class LookingForGroup(_BasicScraper):
     nameSearch = compile(r'/page/(\d+)/')
     help = 'Index format: nnn'

+    @classmethod
     def namer(self, imageUrl, pageUrl):
         return self.nameSearch.search(pageUrl).group(1)

View file

@@ -69,6 +69,15 @@ class Nicky510(_BasicScraper):
     help = 'Index format: stripname'

+class Nnewts(_BasicScraper):
+    url = 'http://nnewts.com/'
+    stripUrl = url + '%s/'
+    firstStripUrl = stripUrl % 'nnewts-page-1'
+    imageSearch = compile(tagre("img", "src", r'(http://nnewts\.com/newty/comics/[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://nnewts\.com/(?:nnewts-)?page-\d+/)', after="navi-prev"))
+    help = 'Index format: page-number'
+
 class NoNeedForBushido(_BasicScraper):
     url = 'http://noneedforbushido.com/latest/'
     stripUrl = 'http://noneedforbushido.com/%s/'

View file

@@ -83,6 +83,17 @@ class PeppermintSaga(_BasicScraper):
     help = 'Index format: number'

+class PHDComics(_BasicScraper):
+    baseurl = 'http://phdcomics.com/'
+    url = baseurl + 'comics.php'
+    stripUrl = baseurl + 'comics/archive.php?comicid=%s'
+    firstStripUrl = stripUrl % '1'
+    imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd[^ ]+)', quote=""))
+    prevSearch = compile(tagre("a", "href", r'((?:comics/)?archive\.php\?comicid=\d+)', quote="") +
+        tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote=""))
+    help = 'Index format: number'
+
 class PicPakDog(_BasicScraper):
     url = 'http://www.picpak.net/'
     stripUrl = url + 'comic/%s/'
@@ -117,6 +128,23 @@ class Pimpette(_BasicScraper):
     help = 'Index format: yyyymmdd'

+class PokeyThePenguin(_BasicScraper):
+    baseurl = 'http://www.yellow5.com/pokey/archive/'
+    url = baseurl + 'index558.html'
+    stripUrl = baseurl + 'index%s.html'
+    firstStripUrl = stripUrl % '1'
+    imageSearch = compile(tagre("img", "src", r'(pokey\d+[^"]+)'))
+    multipleImagesPerStrip = True
+    help = 'Index format: number'
+
+    def getPrevUrl(self, url, data, baseUrl):
+        """Decrease index.html number."""
+        mo = compile(r"index(\d+)\.html").search(url)
+        num = int(mo.group(1)) - 1
+        prefix = url.rsplit('/', 1)[0]
+        return "%s/index%d.html" % (prefix, num)
+
 class Precocious(_BasicScraper):
     url = 'http://www.precociouscomic.com/'
     starter = indirectStarter(url,

View file

@@ -5,7 +5,7 @@
 from re import compile, MULTILINE, IGNORECASE, sub
 from os.path import splitext
 from ..scraper import _BasicScraper
-from ..helpers import indirectStarter
+from ..helpers import indirectStarter, bounceStarter
 from ..util import tagre

@@ -150,6 +150,31 @@ class SluggyFreelance(_BasicScraper):
     help = 'Index format: yymmdd'

+class SnowFlame(_BasicScraper):
+    url = 'http://www.snowflamecomic.com/'
+    stripUrl = url + '?comic=snowflame-%s-%s'
+    firstStripUrl = stripUrl % ('01', '01')
+    imageSearch = compile(tagre("img", "src", r'(http://www\.snowflamecomic\.com/wp-content/uploads/\d+/\d+/[^"]+)'))
+    prevSearch = compile(tagre("span", "class", "mininav-prev") +
+        tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)'))
+    starter = bounceStarter(url,
+        compile(tagre("span", "class", "mininav-next") +
+            tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)')))
+    help = 'Index format: chapter-page'
+
+    def getIndexStripUrl(self, index):
+        return self.stripUrl % tuple(index.split('-'))
+
+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        prefix, filename = imageUrl.rsplit('/', 1)
+        ro = compile(r'snowflame-([^-]+)-([^-]+)')
+        mo = ro.search(pageUrl)
+        chapter = mo.group(1)
+        page = mo.group(2)
+        return "%s-%s-%s" % (chapter, page, filename)
+
 class SodiumEyes(_BasicScraper):
     url = 'http://sodiumeyes.com/'
     stripUrl = url + '%s/'

View file

@@ -57,6 +57,40 @@ class Wonderella(_BasicScraper):
     help = 'Index format: yyyy/mm/dd/name'

+class WorldOfMrToast(_BasicScraper):
+    baseurl = 'http://www.theimaginaryworld.com/'
+    url = baseurl + 'mrTcomicA.html'
+    stripUrl = baseurl + '%s.html'
+    imageSearch = compile(tagre("img", "src", r'(comic[^"]+)'))
+    # list the archive links since there is no prev/next navigation
+    prevurls = (
+        url,
+        baseurl + 'mrTcomicW02.html',
+        baseurl + 'mrTcomicW01.html',
+        baseurl + 'mrGcomic03.html',
+        baseurl + 'mrGcomic02.html',
+        baseurl + 'mrGcomic01.html',
+        baseurl + 'mrTcomicT05.html',
+        baseurl + 'mrTcomicT04.html',
+        baseurl + 'mrTcomicT03.html',
+        baseurl + 'mrTcomicT02.html',
+        baseurl + 'mrTcomicT01.html',
+        baseurl + 'mrTcomicIW3.html',
+        baseurl + 'mrTcomicIW2.html',
+        baseurl + 'mrTcomicIW1.html',
+    )
+    firstStripUrl = prevurls[-1]
+    multipleImagesPerStrip = True
+    help = 'Index format: none'
+
+    def getPrevUrl(self, url, data, baseUrl):
+        idx = self.prevurls.index(url)
+        try:
+            return self.prevurls[idx+1]
+        except IndexError:
+            return None
+
 class WotNow(_BasicScraper):
     url = 'http://shadowburn.binmode.com/wotnow/'
     stripUrl = url + 'comic.php?comic_id=%s'

View file

@@ -33,9 +33,6 @@ class _BasicScraper(object):
     # if more than one image per URL is expected
     multipleImagesPerStrip = False

-    # set of URLs that have no image (eg. only a video link)
-    noImageUrls = set()
-
     # set to False if previous URLs do not match the strip URL (ie. because of redirects)
     prevUrlMatchesStripUrl = True
@@ -55,7 +52,7 @@ class _BasicScraper(object):
         """Initialize internal variables."""
         self.urls = set()
         if indexes:
-            self.indexes = tuple(indexes)
+            self.indexes = tuple(sorted(indexes))
         else:
             self.indexes = tuple()
         self.skippedUrls = set()
@@ -66,7 +63,7 @@ class _BasicScraper(object):
         if not isinstance(other, _BasicScraper):
             return 1
         # first, order by name
-        d = cmp(self.get_name(), other.get_name())
+        d = cmp(self.getName(), other.getName())
         if d != 0:
             return d
         # then by indexes
@@ -74,65 +71,41 @@ class _BasicScraper(object):
     def __hash__(self):
         """Get hash value from name and index list."""
-        return hash((self.get_name(), self.indexes))
+        return hash((self.getName(), self.indexes))

-    def getCurrentStrips(self):
-        """Get current comic strip."""
-        msg = 'Retrieving the current strip'
-        if self.indexes:
-            msg += " for indexes %s" % self.indexes
-        out.info(msg+"...")
-        if self.indexes:
-            for index in self.indexes:
-                url = self.stripUrl % index
-                if url in self.noImageUrls:
-                    self.skipUrl(url)
-                else:
-                    yield self.getStrip(url)
-        else:
-            url = self.getLatestUrl()
-            if url in self.noImageUrls:
-                self.skipUrl(url)
-            else:
-                yield self.getStrip(self.getLatestUrl())
-
-    def skipUrl(self, url):
-        """Document that an URL had no images."""
-        out.info('Skipping URL %s without image' % url)
-        self.skippedUrls.add(url)
-
-    def getStrip(self, url):
-        """Get comic strip for given URL."""
-        data, baseUrl = getPageContent(url, self.session)
-        return self.getComicStrip(url, data, baseUrl)
+    def shouldSkipUrl(self, url):
+        """Determine if search for images in given URL should be skipped."""
+        return False

     def getComicStrip(self, url, data, baseUrl):
         """Get comic strip downloader for given URL and data."""
         imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
         imageUrls = set(map(self.imageUrlModifier, imageUrls))
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
-            out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
-        return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)
+            out.warn("found %d images instead of 1 at %s with %s" % (len(imageUrls), url, self.imageSearch.pattern))
+        elif not imageUrls:
+            out.warn("found no images at %s with %s" % (url, self.imageSearch.pattern))
+        return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)

-    def getAllStrips(self, maxstrips=None):
-        """Get all comic strips."""
+    def getStrips(self, maxstrips=None):
+        """Get comic strips."""
         if maxstrips:
-            msg = 'Retrieving %d strips' % maxstrips
+            word = "strip" if maxstrips == 1 else "strips"
+            msg = 'Retrieving %d %s' % (maxstrips, word)
         else:
             msg = 'Retrieving all strips'
         if self.indexes:
+            if len(self.indexes) == 1:
+                msg += " for index %s" % self.indexes[0]
+            else:
                 msg += " for indexes %s" % self.indexes
+            urls = [self.getIndexStripUrl(index) for index in self.indexes]
+        else:
+            urls = [self.getLatestUrl()]
         if self.adult:
             msg += " (including adult content)"
         out.info(msg)
-        if self.indexes:
-            for index in self.indexes:
-                url = self.stripUrl % index
-                for strip in self.getStripsFor(url, maxstrips):
-                    yield strip
-        else:
-            url = self.getLatestUrl()
+        for url in urls:
             for strip in self.getStripsFor(url, maxstrips):
                 yield strip
@@ -142,15 +115,31 @@ class _BasicScraper(object):
         self.hitFirstStripUrl = False
         seen_urls = set()
         while url:
+            out.info('Get strip URL %s' % url, level=1)
             data, baseUrl = getPageContent(url, self.session)
-            if url in self.noImageUrls:
-                self.skipUrl(url)
+            if self.shouldSkipUrl(url):
+                out.info('Skipping URL %s' % url)
+                self.skippedUrls.add(url)
             else:
                 yield self.getComicStrip(url, data, baseUrl)
             if self.firstStripUrl == url:
                 out.debug("Stop at first URL %s" % url)
                 self.hitFirstStripUrl = True
                 break
+            if maxstrips is not None:
+                maxstrips -= 1
+                if maxstrips <= 0:
+                    break
+            prevUrl = self.getPrevUrl(url, data, baseUrl)
+            seen_urls.add(url)
+            if prevUrl in seen_urls:
+                # avoid recursive URL loops
+                out.warn("Already seen previous URL %r" % prevUrl)
+                break
+            url = prevUrl
+
+    def getPrevUrl(self, url, data, baseUrl):
+        """Find previous URL."""
         prevUrl = None
         if self.prevSearch:
             try:
@@ -161,23 +150,14 @@ class _BasicScraper(object):
             else:
                 prevUrl = self.prevUrlModifier(prevUrl)
                 out.debug("Matched previous URL %s" % prevUrl)
-            seen_urls.add(url)
-            if prevUrl in seen_urls:
-                # avoid recursive URL loops
-                out.warn("Already seen previous URL %r" % prevUrl)
-                break
-            url = prevUrl
-            if maxstrips is not None:
-                maxstrips -= 1
-                if maxstrips <= 0:
-                    break
+        return prevUrl

-    def setStrip(self, index):
-        """Set current comic strip URL."""
-        self.currentUrl = self.stripUrl % index
+    def getIndexStripUrl(self, index):
+        """Get comic strip URL from index."""
+        return self.stripUrl % index

     @classmethod
-    def get_name(cls):
+    def getName(cls):
         """Get scraper name."""
         if hasattr(cls, 'name'):
             return cls.name
@@ -209,10 +189,6 @@ class _BasicScraper(object):
         """
         return imageUrl

-    def getFilename(self, imageUrl, pageUrl):
-        """Return filename for given image and page URL."""
-        return self.namer(imageUrl, pageUrl)
-
     def getLatestUrl(self):
         """Get starter URL from where to scrape comic strips."""
         return self.starter()
@@ -227,7 +203,7 @@ def find_scraperclasses(comic, multiple_allowed=False):
     candidates = []
     cname = comic.lower()
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname == cname:
             # perfect match
             if not multiple_allowed:
@@ -237,7 +213,7 @@ def find_scraperclasses(comic, multiple_allowed=False):
         elif cname in lname:
             candidates.append(scraperclass)
     if len(candidates) > 1 and not multiple_allowed:
-        comics = ", ".join(x.get_name() for x in candidates)
+        comics = ", ".join(x.getName() for x in candidates)
         raise ValueError('multiple comics found: %s' % comics)
     elif not candidates:
         raise ValueError('comic %r not found' % comic)
@@ -266,10 +242,10 @@ def check_scrapers():
     """Check for duplicate scraper class names."""
     d = {}
     for scraperclass in _scraperclasses:
-        name = scraperclass.get_name().lower()
+        name = scraperclass.getName().lower()
         if name in d:
-            name1 = scraperclass.get_name()
-            name2 = d[name].get_name()
+            name1 = scraperclass.getName()
+            name2 = d[name].getName()
             raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))
         d[name] = scraperclass
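
Taken together: the class-level noImageUrls set becomes an overridable shouldSkipUrl() hook, and setStrip() becomes getIndexStripUrl(), which returns a URL instead of mutating state. A sketch of a subclass using the new hooks; the class name and URLs are illustrative only (compare AstronomyPOTD and KevinAndKell above for real uses):

    class SampleComic(_BasicScraper):
        stripUrl = 'http://sample.invalid/%s.html'

        def shouldSkipUrl(self, url):
            """Skip pages that embed a video instead of a comic image."""
            return url == 'http://sample.invalid/video-special.html'

        def getIndexStripUrl(self, index):
            """Map a user-supplied index like 2012-07-22 onto the strip URL."""
            return self.stripUrl % index.replace('-', '/')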

View file

@@ -65,7 +65,7 @@ def has_comic(name):
         ("SmackJeeves/%s" % name).lower(),
     ]
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname in names:
             return True
     return False

View file

@@ -275,7 +275,7 @@ def has_comic(name):
         ("Arcamax/%s" % name).lower(),
     ]
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname in names:
             return True
     return False

View file

@@ -83,7 +83,7 @@ def has_creators_comic(name):
     """Test if comic name already exists."""
     cname = "Creators/%s" % name
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname == cname.lower():
             return True
     return False

View file

@@ -407,7 +407,7 @@ def has_comic(name):
     cname = ("Creators/%s" % name).lower()
     gname = ("GoComics/%s" % name).lower()
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname == cname or lname == gname:
             return True
     return False

View file

@@ -291,7 +291,7 @@ def has_comic(name):
     """Check if comic name already exists."""
     cname = name.lower()
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname == cname:
             return True
     return False

View file

@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 # -*- coding: iso-8859-1 -*-
-# Dosage, the webcomic downloader
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2013 Bastian Kleineidam
 from __future__ import print_function
@@ -394,7 +393,8 @@ class MyRegister (register, object):
 args = dict(
     name = AppName,
     version = AppVersion,
-    description = 'a commandline webcomic downloader and archiver',
+    description = 'a commandline comic downloader and archiver',
+    keywords = 'comic,webcomic,downloader,archiver',
     author = 'Tristan Seligmann, Jonathan Jacobs, Bastian Kleineidam',
     author_email = 'bastian.kleineidam@web.de',
     maintainer = 'Bastian Kleineidam',

View file

@@ -8,7 +8,7 @@ class TestComicNames(TestCase):

     def test_names(self):
         for scraperclass in scraper.get_scraperclasses():
-            name = scraperclass.get_name()
+            name = scraperclass.getName()
             self.assertTrue(name.count('/') <= 1, name)
             if '/' in name:
                 comicname = name.split('/')[1]

View file

@@ -34,7 +34,7 @@ class _ComicTester(TestCase):
     scraperclass=None

     def setUp(self):
-        self.name = self.scraperclass.get_name()
+        self.name = self.scraperclass.getName()
         self.url = self.scraperclass.starter()
         # create a temporary directory for images
         self.tmpdir = tempfile.mkdtemp()
@@ -64,7 +64,7 @@ class _ComicTester(TestCase):
     def _test_comic(self, scraperobj):
         num = 0
         max_strips = 5
-        for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
+        for strip in scraperobj.getStrips(max_strips):
             images = []
             for image in strip.getImages():
                 images.append(image.url)
@@ -122,7 +122,7 @@ def generate_comic_testers():
     g = globals()
     if "TRAVIS" in os.environ:
         # Get limited number of scraper tests on Travis builds.
-        max_scrapers = 1500
+        max_scrapers = 500
         scraperclasses = islice(scraper.get_scraperclasses(), 0, max_scrapers)
     else:
         scraperclasses = scraper.get_scraperclasses()

View file

@@ -52,6 +52,11 @@ class TestDosage (unittest.TestCase):
         self.assertRaises(OSError, run_with_options, [])
         self.assertRaises(OSError, run_with_options, ['--imadoofus'])

-    def test_fetch(self):
+    def test_fetch_html(self):
         run_with_options(["-n", "2", "-b", self.tmpdir, "-o", "html", "calvinandhobbes"])
+
+    def test_fetch_rss(self):
         run_with_options(["--numstrips", "2", "--baseurl", "bla", "--basepath", self.tmpdir, "--output", "rss", "--adult", "sexyloser"])
+
+    def test_fetch_indexed(self):
+        run_with_options(["-n", "2", "-b", self.tmpdir, "calvinandhobbes:2012/02/02"])