Added some comic strips and cleaned up the scraper code.

Bastian Kleineidam 2013-03-06 20:00:30 +01:00
parent 6091138481
commit bae2a96d8b
31 changed files with 296 additions and 128 deletions

View file

@ -1,14 +1,14 @@
Dosage
=======
Dosage is a commandline webcomic downloader and archiver.
Dosage is a commandline comic downloader and archiver.
Introduction
-------------
Dosage is designed to keep a local copy of specific webcomics
and other picture-based content such as Picture of the Day sites.
With the dosage commandline script you can get the latest strip of
webcomic, or catch-up to the last strip downloaded, or download a
a webcomic, or catch-up to the last strip downloaded, or download a
strip for a particular date/index (if the webcomic's site layout
makes this possible).
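For example, "dosage calvinandhobbes" fetches the latest Calvin and Hobbes
strip, while "dosage -a calvinandhobbes:2012/07/22" (the invocation shown in
the manual page below) starts at that index and works backwards to the
beginning.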
@ -91,7 +91,7 @@ Technical Description
Dosage is written in Python and relies on regular expressions to
do most of the grunt work.
For each webcomic Dosage has a plugin module, found in the "plugins"
For each comic Dosage has a plugin module, found in the "plugins"
subdirectory of the dosagelib directory. Each module is a subclass of
the _BasicScraper class and specifies where to download its comic images.
Some comic syndicates (GoComics for example) have a standard layout for all
@ -100,7 +100,7 @@ instances from a given list of comic strips.
Extending Dosage
-----------------
In order to add a new webcomic, a new module class has to be created in
In order to add a new comic, a new module class has to be created in
one of the *.py files in the dosagelib/plugins subdirectory.
Look at the existing module classes for examples.
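A minimal sketch of such a module class, modeled on the plugins added in
this commit; "ExampleComic" and all of its URLs are hypothetical:

# -*- coding: iso-8859-1 -*-
# Hypothetical plugin sketch; ExampleComic and its URLs are invented.
from re import compile
from ..scraper import _BasicScraper
from ..util import tagre

class ExampleComic(_BasicScraper):
    url = 'http://examplecomic.invalid/'
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2011/01/01/page-1'
    # regular expression matching the strip image on a page
    imageSearch = compile(tagre("img", "src", r'(http://examplecomic\.invalid/comics/[^"]+)'))
    # regular expression matching the link to the previous strip
    prevSearch = compile(tagre("a", "href", r'(http://examplecomic\.invalid/\d+/\d+/\d+/[^"]+)', after="prev"))
    help = 'Index format: yyyy/mm/dd/stripname'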

View file

@ -1,5 +1,10 @@
Dosage 1.13 (released xx.xx.2013)
Features:
- comics: Added comic strips AxeCop, Bearmageddon, DeadWinter,
HarkAVagrant, IAmArg, LoadingArtist, Nnewts, PHDComics, PokeyThePenguin,
SnowFlame and WorldOfMrToast.
Fixes:
- cmdline: Catch error when piping output to another
program or file under Windows.

View file

@ -1,6 +1,6 @@
.TH DOSAGE 1
.SH NAME
dosage \- a commandline webcomic downloader and archiver
dosage \- a commandline comic downloader and archiver
.SH SYNOPSIS
\fBdosage\fP [\fIoptions\fP] \fImodule\fP...
.SH DESCRIPTION
@ -128,7 +128,7 @@ Retrieve the Penny Arcade strip for a given index:
Retrieve Calvin and Hobbes strips from a given index going backwards to
the beginning.
.RS
.B dosage \-a calvinandhobbes:20120722
.B dosage \-a calvinandhobbes:2012/07/22
.RE
.PP
On Unix, \fBxargs(1)\fP can download several comic strips in parallel,

View file

@ -9,7 +9,7 @@ Section: User Commands (1)<BR><A HREF="#index">Index</A>
<A NAME="lbAB">&nbsp;</A>
<H2>NAME</H2>
dosage - a commandline webcomic downloader and archiver
dosage - a commandline comic downloader and archiver
<A NAME="lbAC">&nbsp;</A>
<H2>SYNOPSIS</H2>
@ -174,7 +174,7 @@ Retrieve the Penny Arcade strip for a given index:
Retrieve Calvin and Hobbes strips from a given index going backwards to
the beginning.
<DL COMPACT><DT><DD>
<B>dosage -a calvinandhobbes:20120722</B>
<B>dosage -a calvinandhobbes:2012/07/22</B>
</DL>

View file

@ -3,7 +3,7 @@ DOSAGE(1) DOSAGE(1)
NAME
dosage - a commandline webcomic downloader and archiver
dosage - a commandline comic downloader and archiver
SYNOPSIS
dosage [options] module...
@ -116,7 +116,7 @@ EXAMPLES
Retrieve Calvin and Hobbes strips from a given index going
backwards to the beginning.
dosage -a calvinandhobbes:20120722
dosage -a calvinandhobbes:2012/07/22
On Unix, xargs(1) can download several comic strips in parallel, for example using up to 4 processes:

dosage
View file

@ -1,6 +1,5 @@
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
# Dosage, the webcomic downloader
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
from __future__ import print_function
@ -56,7 +55,7 @@ def setupOptions():
@rtype argparse.ArgumentParser
"""
kwargs = dict(
description = "A commandline webcomic downloader and archiver.",
description = "A commandline comic downloader and archiver.",
epilog = Examples,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
@ -131,7 +130,7 @@ def displayHelp(comics):
def displayComicHelp(scraperobj):
"""Print description and help for a comic."""
out.context = scraperobj.get_name()
out.context = scraperobj.getName()
try:
if scraperobj.description:
out.info("Description: " + scraperobj.description)
@ -163,15 +162,16 @@ def getComics(options):
def getStrips(scraperobj, options):
"""Get all strips from a scraper."""
errors = 0
out.context = scraperobj.get_name()
out.context = scraperobj.getName()
if options.all:
strips = scraperobj.getAllStrips()
numstrips = None
elif options.numstrips:
strips = scraperobj.getAllStrips(options.numstrips)
numstrips = options.numstrips
else:
strips = scraperobj.getCurrentStrips()
# get current strip
numstrips = 1
try:
for strip in strips:
for strip in scraperobj.getStrips(numstrips):
_errors, skipped = saveComicStrip(strip, options.basepath)
errors += _errors
if skipped and options.cont:
@ -206,7 +206,7 @@ def doList(columnList=True):
"""List available comics."""
out.info('Available comic scrapers:')
out.info('Comics marked with [A] require age confirmation with the --adult option.')
scrapers = sorted(getScrapers(['@@']), key=lambda s: s.get_name())
scrapers = sorted(getScrapers(['@@']), key=lambda s: s.getName())
try:
if columnList:
num = doColumnList(scrapers)
@ -243,7 +243,7 @@ def doColumnList(scrapers):
def getScraperName(scraperobj, limit=None):
"""Get comic scraper name."""
suffix = " [A]" if scraperobj.adult else ""
name = scraperobj.get_name()
name = scraperobj.getName()
if limit is not None:
name = strlimit(name, limit)
return name + suffix
@ -259,7 +259,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
if not adult and scraperclass.adult:
warn_adult(scraperclass)
continue
dirname = getDirname(scraperclass.get_name())
dirname = getDirname(scraperclass.getName())
if os.path.isdir(os.path.join(basepath, dirname)):
yield scraperclass()
elif '@@' in comics:
@ -293,7 +293,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
def warn_adult(scraperclass):
"""Print warning about adult content."""
out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.get_name())
out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName())
def main():

View file

@ -1,8 +1,8 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
# Copyright (C) 2012-2013 Bastian Kleineidam
"""
Automated webcomic downloader. Dosage traverses webcomic websites in
Automated comic downloader. Dosage traverses comic websites in
order to download each strip of the comic. The intended use is for
mirroring the strips locally for ease of viewing; redistribution of the
downloaded strips may violate copyright, and is not advisable unless you
@ -11,7 +11,7 @@ your intentions, and received permission to distribute.
The primary dosage interface is currently the 'mainline' script, which
is just a thin wrapper that invokes L{dosage.mainline}. Comic modules
for each webcomic are located in L{dosage.modules}; most of these make
for each comic are located in L{dosage.modules}; most of these make
use of the helper base classes and mixins in L{dosage.modules.helpers},
thus making their individual implementations trivial.
"""

View file

@ -88,6 +88,7 @@ class ComicImage(object):
def save(self, basepath):
"""Save comic URL to filename on disk."""
out.info("Get image URL %s" % self.url, level=1)
self.connect()
filename = "%s%s" % (self.filename, self.ext)
comicSize = self.contentLength
@ -96,6 +97,7 @@ class ComicImage(object):
os.makedirs(comicDir)
fn = os.path.join(comicDir, filename)
# compare with >= since comicSize could be the compressed size
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
self.touch(fn)
out.info('Skipping existing file "%s".' % fn)

View file

@ -159,14 +159,17 @@ class AstronomyPOTD(_BasicScraper):
stripUrl = 'http://antwrp.gsfc.nasa.gov/apod/ap%s.html'
imageSearch = compile(tagre("a", "href", r'(image/\d{4}/[^"]+)'))
multipleImagesPerStrip = True
noImageUrls = set([
'http://antwrp.gsfc.nasa.gov/apod/ap130217.html', # video
'http://antwrp.gsfc.nasa.gov/apod/ap130218.html', # video
'http://antwrp.gsfc.nasa.gov/apod/ap130226.html', # video
])
prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "&lt;</a>")
help = 'Index format: yymmdd'
def shouldSkipUrl(self, url):
"""Skip pages without images."""
return url in (
'http://antwrp.gsfc.nasa.gov/apod/ap130217.html', # video
'http://antwrp.gsfc.nasa.gov/apod/ap130218.html', # video
'http://antwrp.gsfc.nasa.gov/apod/ap130226.html', # video
)
@classmethod
def namer(cls, imageUrl, pageUrl):
return '%s-%s' % (pageUrl.split('/')[-1].split('.')[0][2:],
@ -269,3 +272,14 @@ class Annyseed(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(Annyseed[^"]+)'))
prevSearch = compile(r'<a href="(http://www\.colourofivy\.com/[^"]+)"><img src="Last.gif"')
help = 'Index format: nnn'
class AxeCop(_BasicScraper):
url = 'http://axecop.com/'
starter = indirectStarter(url, compile(tagre("a", "href", r'(http://axecop\.com/index\.php/acepisodes/read/episode_\d+/)')))
stripUrl = url + 'index.php/acepisodes/read/episode_%s/'
firstStripUrl = stripUrl % '0'
imageSearch = compile(tagre("img", "src", r'(http://axecop\.com/images/uploads/axecop[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://axecop\.com/index\.php/acepisodes/read/episode_\d+/)') +
tagre("img", "src", r'http://axecop\.com/acimages/buttons/page_left\.png'))
help = 'Index format: number'
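Modules like AxeCop use the starter hook because their front page is not
itself a comic page: getLatestUrl() in scraper.py (further below) simply
returns self.starter(). The following is a simplified, hypothetical sketch
of what an indirectStarter-style factory does; the real implementation
lives in dosagelib/helpers.py and handles the scraper's HTTP session,
which is glossed over here:

def indirectStarter(url, latestSearch):
    """Return a starter that loads url and follows the first
    latestSearch match to the actual latest comic page.
    Simplified sketch, not the real helper."""
    @classmethod
    def _starter(cls):
        # getPageContent and fetchUrl as used throughout this commit;
        # passing None for the session is an assumption of this sketch
        data, baseUrl = getPageContent(url, None)
        return fetchUrl(url, data, baseUrl, latestSearch)
    return _starter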

View file

@ -25,6 +25,15 @@ class Bardsworth(_BasicScraper):
help = 'Index format: nnn'
class Bearmageddon(_BasicScraper):
url = 'http://bearmageddon.com/'
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2011/08/01/page-1'
imageSearch = compile(tagre("img", "src", r'(http://bearmageddon\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://bearmageddon\.com/\d+/\d+/\d+/[^"]+)', after='navi-prev'))
help = 'Index format: yyyy/mm/dd/stripname'
class BetterDays(_BasicScraper):
url = 'http://jaynaylor.com/betterdays/'
stripUrl = url + 'archives/%s.html'
@ -119,6 +128,16 @@ class BrentalFlossGuest(BrentalFloss):
stripUrl = url + '?id=%s'
# XXX disallowed by robots.txt
class _BringBackRoomies(_BasicScraper):
url = "http://www.bringbackroomies.com/"
stripUrl = url + "comic/%s"
imageSearch = compile(tagre("img", "src", r'(http://www\.bringbackroomies\.com/wp-content/uploads/\d+/\d+/[^"]+)'))
prevSearch = compile(tagre("span", "class", "mininav-prev") +
tagre("a", "href", r'(http://www\.bringbackroomies\.com/comic/[^"]+)'))
help = 'Index format: stripname'
class Brink(_BasicScraper):
url = 'http://paperfangs.com/brink/'
stripUrl = url + '?p=%s'

View file

@ -209,9 +209,12 @@ class CyanideAndHappiness(_BasicScraper):
stripUrl = url + '%s/'
imageSearch = compile(tagre("img", "src", r'(http://(?:www\.)?explosm\.net/db/files/[^"]+)', before="a daily webcomic"))
prevSearch = compile(tagre("a", "href", r'(/comics/\d+/)', before="prev"))
noImageUrls = set(["http://www.explosm.net/comics/3082/"])
help = 'Index format: n (unpadded)'
def shouldSkipUrl(self, url):
"""Skip pages without images."""
return url == "http://www.explosm.net/comics/3082/"
@classmethod
def namer(cls, imageUrl, pageUrl):
imgname = imageUrl.split('/')[-1]

View file

@ -54,6 +54,15 @@ class DarkWings(_BasicScraper):
help = 'Index format: yyyy/mm/dd/page-nn-mm'
class DeadWinter(_BasicScraper):
url = 'http://deadwinter.cc/'
stripUrl = url + 'page/%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r"(/static/page/strip/\d+[^']+)", quote="'"))
prevSearch = compile(tagre("a", "href", r'(/page/\d+)') + "Previous")
help = 'Index format: number'
class DeathToTheExtremist(_BasicScraper):
url = 'http://www.dtecomic.com/'
stripUrl = url + '?n=%s'

View file

@ -44,8 +44,8 @@ class FilibusterCartoons(_BasicScraper):
class FirstWorldProblems(_BasicScraper):
url = 'http://bradcolbow.com/archive/C5/'
stripUrl = url + '%s'
firstStripUrl = 'http://bradcolbow.com/archive/C5/P10/'
stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'P10'
imageSearch = compile(tagre("img", "src", r'(http://(?:fwpcomics\.s3\.amazonaws\.com|s3\.amazonaws\.com/fwpcomics)/s1-[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://bradcolbow\.com/archive/C5/[^"]+)', before="prev"))
multipleImagesPerStrip = True
@ -126,9 +126,9 @@ class Fallen(_BasicScraper):
part = pageUrl.split('-')[-1].split('.')[0]
return '%s-%s' % (part, num)
def setStrip(self, index):
def getIndexStripUrl(self, index):
index, part = index.split('-')
self.currentUrl = self.stripUrl % (part, index, part)
return self.stripUrl % (part, index, part)
class FredoAndPidjin(_BasicScraper):

View file

@ -3,6 +3,26 @@
from re import compile
from ..scraper import _BasicScraper
from ..util import tagre
from ..helpers import bounceStarter
class HarkAVagrant(_BasicScraper):
url = 'http://www.harkavagrant.com/'
starter = bounceStarter(url,
compile(tagre("a", "href", r'(http://www\.harkavagrant\.com/index\.php\?id=\d+)') +
tagre("img", "src", "buttonnext.png")))
stripUrl = url + 'index.php?id=%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(http://www.harkavagrant.com/[^"]+)', after='BORDER'))
prevSearch = compile(tagre("a", "href", r'(http://www\.harkavagrant\.com/index\.php\?id=\d+)') +
tagre("img", "src", "buttonprevious.png"))
help = 'Index format: number'
@classmethod
def namer(cls, imageUrl, pageUrl):
filename = imageUrl.rsplit('/', 1)[1]
num = pageUrl.rsplit('=', 1)[1]
return '%s-%s' % (num, filename)
class HijinksEnsue(_BasicScraper):

View file

@ -7,6 +7,15 @@ from ..scraper import _BasicScraper
from ..util import tagre
class IAmArg(_BasicScraper):
url = 'http://iamarg.com/'
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2011/05/08/05082011'
imageSearch = compile(tagre("img", "src", r'(http://iamarg\.com/comics/\d+-\d+-\d+[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://iamarg\.com/\d+/\d+/\d+/[^"]+)', after="prev"))
help = 'Index format: yyyy/mm/dd/stripname'
class IanJay(_BasicScraper):
url = 'http://ianjay.net/'
stripUrl = url + '?p=%s'

View file

@ -37,8 +37,8 @@ class KevinAndKell(_BasicScraper):
prevSearch = compile(r'<a.+?href="(/?(\.\./)?\d+/kk\d+\.html)"[^>]*><span>Previous Strip', IGNORECASE)
help = 'Index format: yyyy-mm-dd'
def setStrip(self, index):
self.currentUrl = self.stripUrl % tuple(map(int, index.split('-')))
def getIndexStripUrl(self, index):
return self.stripUrl % tuple(map(int, index.split('-')))
class KhaosKomix(_BasicScraper):

View file

@ -24,6 +24,15 @@ class Lint(_BasicScraper):
help = 'Index format: yyyy/mm/dd/num-name'
class LoadingArtist(_BasicScraper):
url = 'http://www.loadingartist.com/'
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2011/01/04/born'
imageSearch = compile(tagre("img", "src", r'(http://www\.loadingartist\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.loadingartist\.com/\d+/\d+/\d+/[^"]+/)', after="prev"))
help = 'Index format: yyyy/mm/dd/stripname'
class LookingForGroup(_BasicScraper):
url = 'http://www.lfgcomic.com/'
stripUrl = url + 'page/%s/'
@ -33,6 +42,7 @@ class LookingForGroup(_BasicScraper):
nameSearch = compile(r'/page/(\d+)/')
help = 'Index format: nnn'
@classmethod
def namer(cls, imageUrl, pageUrl):
    return cls.nameSearch.search(pageUrl).group(1)

View file

@ -69,6 +69,15 @@ class Nicky510(_BasicScraper):
help = 'Index format: stripname'
class Nnewts(_BasicScraper):
url = 'http://nnewts.com/'
stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'nnewts-page-1'
imageSearch = compile(tagre("img", "src", r'(http://nnewts\.com/newty/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://nnewts\.com/(?:nnewts-)?page-\d+/)', after="navi-prev"))
help = 'Index format: page-number'
class NoNeedForBushido(_BasicScraper):
url = 'http://noneedforbushido.com/latest/'
stripUrl = 'http://noneedforbushido.com/%s/'

View file

@ -83,6 +83,17 @@ class PeppermintSaga(_BasicScraper):
help = 'Index format: number'
class PHDComics(_BasicScraper):
baseurl = 'http://phdcomics.com/'
url = baseurl + 'comics.php'
stripUrl = baseurl + 'comics/archive.php?comicid=%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd[^ ]+)', quote=""))
prevSearch = compile(tagre("a", "href", r'((?:comics/)?archive\.php\?comicid=\d+)', quote="") +
tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote=""))
help = 'Index format: number'
class PicPakDog(_BasicScraper):
url = 'http://www.picpak.net/'
stripUrl = url + 'comic/%s/'
@ -117,6 +128,23 @@ class Pimpette(_BasicScraper):
help = 'Index format: yyyymmdd'
class PokeyThePenguin(_BasicScraper):
baseurl = 'http://www.yellow5.com/pokey/archive/'
url = baseurl + 'index558.html'
stripUrl = baseurl + 'index%s.html'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(pokey\d+[^"]+)'))
multipleImagesPerStrip = True
help = 'Index format: number'
def getPrevUrl(self, url, data, baseUrl):
"""Decrease index.html number."""
mo = compile(r"index(\d+)\.html").search(url)
num = int(mo.group(1)) - 1
prefix = url.rsplit('/', 1)[0]
return "%s/index%d.html" % (prefix, num)
class Precocious(_BasicScraper):
url = 'http://www.precociouscomic.com/'
starter = indirectStarter(url,

View file

@ -5,7 +5,7 @@
from re import compile, MULTILINE, IGNORECASE, sub
from os.path import splitext
from ..scraper import _BasicScraper
from ..helpers import indirectStarter
from ..helpers import indirectStarter, bounceStarter
from ..util import tagre
@ -150,6 +150,31 @@ class SluggyFreelance(_BasicScraper):
help = 'Index format: yymmdd'
class SnowFlame(_BasicScraper):
url = 'http://www.snowflamecomic.com/'
stripUrl = url + '?comic=snowflame-%s-%s'
firstStripUrl = stripUrl % ('01', '01')
imageSearch = compile(tagre("img", "src", r'(http://www\.snowflamecomic\.com/wp-content/uploads/\d+/\d+/[^"]+)'))
prevSearch = compile(tagre("span", "class", "mininav-prev") +
tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)'))
starter = bounceStarter(url,
compile(tagre("span", "class", "mininav-next") +
tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)')))
help = 'Index format: chapter-page'
def getIndexStripUrl(self, index):
    return self.stripUrl % tuple(index.split('-'))
@classmethod
def namer(cls, imageUrl, pageUrl):
prefix, filename = imageUrl.rsplit('/', 1)
ro = compile(r'snowflame-([^-]+)-([^-]+)')
mo = ro.search(pageUrl)
chapter = mo.group(1)
page = mo.group(2)
return "%s-%s-%s" % (chapter, page, filename)
class SodiumEyes(_BasicScraper):
url = 'http://sodiumeyes.com/'
stripUrl = url + '%s/'

View file

@ -57,6 +57,40 @@ class Wonderella(_BasicScraper):
help = 'Index format: yyyy/mm/dd/name'
class WorldOfMrToast(_BasicScraper):
baseurl = 'http://www.theimaginaryworld.com/'
url = baseurl + 'mrTcomicA.html'
stripUrl = baseurl + '%s.html'
imageSearch = compile(tagre("img", "src", r'(comic[^"]+)'))
# list the archive links since there is no prev/next navigation
prevurls = (
url,
baseurl + 'mrTcomicW02.html',
baseurl + 'mrTcomicW01.html',
baseurl + 'mrGcomic03.html',
baseurl + 'mrGcomic02.html',
baseurl + 'mrGcomic01.html',
baseurl + 'mrTcomicT05.html',
baseurl + 'mrTcomicT04.html',
baseurl + 'mrTcomicT03.html',
baseurl + 'mrTcomicT02.html',
baseurl + 'mrTcomicT01.html',
baseurl + 'mrTcomicIW3.html',
baseurl + 'mrTcomicIW2.html',
baseurl + 'mrTcomicIW1.html',
)
firstStripUrl = prevurls[-1]
multipleImagesPerStrip = True
help = 'Index format: none'
def getPrevUrl(self, url, data, baseUrl):
idx = self.prevurls.index(url)
try:
return self.prevurls[idx+1]
except IndexError:
return None
class WotNow(_BasicScraper):
url = 'http://shadowburn.binmode.com/wotnow/'
stripUrl = url + 'comic.php?comic_id=%s'

View file

@ -33,9 +33,6 @@ class _BasicScraper(object):
# if more than one image per URL is expected
multipleImagesPerStrip = False
# set of URLs that have no image (eg. only a video link)
noImageUrls = set()
# set to False if previous URLs do not match the strip URL (ie. because of redirects)
prevUrlMatchesStripUrl = True
@ -55,7 +52,7 @@ class _BasicScraper(object):
"""Initialize internal variables."""
self.urls = set()
if indexes:
self.indexes = tuple(indexes)
self.indexes = tuple(sorted(indexes))
else:
self.indexes = tuple()
self.skippedUrls = set()
@ -66,7 +63,7 @@ class _BasicScraper(object):
if not isinstance(other, _BasicScraper):
return 1
# first, order by name
d = cmp(self.get_name(), other.get_name())
d = cmp(self.getName(), other.getName())
if d != 0:
return d
# then by indexes
@ -74,65 +71,41 @@ class _BasicScraper(object):
def __hash__(self):
"""Get hash value from name and index list."""
return hash((self.get_name(), self.indexes))
return hash((self.getName(), self.indexes))
def getCurrentStrips(self):
"""Get current comic strip."""
msg = 'Retrieving the current strip'
if self.indexes:
msg += " for indexes %s" % self.indexes
out.info(msg+"...")
if self.indexes:
for index in self.indexes:
url = self.stripUrl % index
if url in self.noImageUrls:
self.skipUrl(url)
else:
yield self.getStrip(url)
else:
url = self.getLatestUrl()
if url in self.noImageUrls:
self.skipUrl(url)
else:
yield self.getStrip(self.getLatestUrl())
def skipUrl(self, url):
"""Document that an URL had no images."""
out.info('Skipping URL %s without image' % url)
self.skippedUrls.add(url)
def getStrip(self, url):
"""Get comic strip for given URL."""
data, baseUrl = getPageContent(url, self.session)
return self.getComicStrip(url, data, baseUrl)
def shouldSkipUrl(self, url):
"""Determine if search for images in given URL should be skipped."""
return False
def getComicStrip(self, url, data, baseUrl):
"""Get comic strip downloader for given URL and data."""
imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
imageUrls = set(map(self.imageUrlModifier, imageUrls))
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)
out.warn("found %d images instead of 1 at %s with %s" % (len(imageUrls), url, self.imageSearch.pattern))
elif not imageUrls:
out.warn("found no images at %s with %s" % (url, self.imageSearch.pattern))
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
def getAllStrips(self, maxstrips=None):
"""Get all comic strips."""
def getStrips(self, maxstrips=None):
"""Get comic strips."""
if maxstrips:
msg = 'Retrieving %d strips' % maxstrips
word = "strip" if maxstrips == 1 else "strips"
msg = 'Retrieving %d %s' % (maxstrips, word)
else:
msg = 'Retrieving all strips'
if self.indexes:
msg += " for indexes %s" % self.indexes
if len(self.indexes) == 1:
msg += " for index %s" % self.indexes[0]
else:
msg += " for indexes %s" % self.indexes
urls = [self.getIndexStripUrl(index) for index in self.indexes]
else:
urls = [self.getLatestUrl()]
if self.adult:
msg += " (including adult content)"
out.info(msg)
if self.indexes:
for index in self.indexes:
url = self.stripUrl % index
for strip in self.getStripsFor(url, maxstrips):
yield strip
else:
url = self.getLatestUrl()
for url in urls:
for strip in self.getStripsFor(url, maxstrips):
yield strip
@ -142,42 +115,49 @@ class _BasicScraper(object):
self.hitFirstStripUrl = False
seen_urls = set()
while url:
out.info('Get strip URL %s' % url, level=1)
data, baseUrl = getPageContent(url, self.session)
if url in self.noImageUrls:
self.skipUrl(url)
if self.shouldSkipUrl(url):
out.info('Skipping URL %s' % url)
self.skippedUrls.add(url)
else:
yield self.getComicStrip(url, data, baseUrl)
if self.firstStripUrl == url:
out.debug("Stop at first URL %s" % url)
self.hitFirstStripUrl = True
break
prevUrl = None
if self.prevSearch:
try:
prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
except ValueError as msg:
# assume there is no previous URL, but print a warning
out.warn("%s Assuming no previous comic strips exist." % msg)
else:
prevUrl = self.prevUrlModifier(prevUrl)
out.debug("Matched previous URL %s" % prevUrl)
if maxstrips is not None:
maxstrips -= 1
if maxstrips <= 0:
break
prevUrl = self.getPrevUrl(url, data, baseUrl)
seen_urls.add(url)
if prevUrl in seen_urls:
# avoid recursive URL loops
out.warn("Already seen previous URL %r" % prevUrl)
break
url = prevUrl
if maxstrips is not None:
maxstrips -= 1
if maxstrips <= 0:
break
def setStrip(self, index):
"""Set current comic strip URL."""
self.currentUrl = self.stripUrl % index
def getPrevUrl(self, url, data, baseUrl):
"""Find previous URL."""
prevUrl = None
if self.prevSearch:
try:
prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
except ValueError as msg:
# assume there is no previous URL, but print a warning
out.warn("%s Assuming no previous comic strips exist." % msg)
else:
prevUrl = self.prevUrlModifier(prevUrl)
out.debug("Matched previous URL %s" % prevUrl)
return prevUrl
def getIndexStripUrl(self, index):
"""Get comic strip URL from index."""
return self.stripUrl % index
@classmethod
def get_name(cls):
def getName(cls):
"""Get scraper name."""
if hasattr(cls, 'name'):
return cls.name
@ -209,10 +189,6 @@ class _BasicScraper(object):
"""
return imageUrl
def getFilename(self, imageUrl, pageUrl):
"""Return filename for given image and page URL."""
return self.namer(imageUrl, pageUrl)
def getLatestUrl(self):
"""Get starter URL from where to scrape comic strips."""
return self.starter()
@ -227,7 +203,7 @@ def find_scraperclasses(comic, multiple_allowed=False):
candidates = []
cname = comic.lower()
for scraperclass in get_scraperclasses():
lname = scraperclass.get_name().lower()
lname = scraperclass.getName().lower()
if lname == cname:
# perfect match
if not multiple_allowed:
@ -237,7 +213,7 @@ def find_scraperclasses(comic, multiple_allowed=False):
elif cname in lname:
candidates.append(scraperclass)
if len(candidates) > 1 and not multiple_allowed:
comics = ", ".join(x.get_name() for x in candidates)
comics = ", ".join(x.getName() for x in candidates)
raise ValueError('multiple comics found: %s' % comics)
elif not candidates:
raise ValueError('comic %r not found' % comic)
@ -266,10 +242,10 @@ def check_scrapers():
"""Check for duplicate scraper class names."""
d = {}
for scraperclass in _scraperclasses:
name = scraperclass.get_name().lower()
name = scraperclass.getName().lower()
if name in d:
name1 = scraperclass.get_name()
name2 = d[name].get_name()
name1 = scraperclass.getName()
name2 = d[name].getName()
raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))
d[name] = scraperclass
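With this refactor a single getStrips() generator serves both the
current-strip and catch-up cases, as used by the mainline script above and
the tests below. A minimal usage sketch, assuming a scraper instance
obtained via getScrapers() and a hypothetical target directory "Comics":

# fetch up to five strips, saving every image of each strip
for strip in scraperobj.getStrips(5):
    for image in strip.getImages():
        image.save("Comics")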

View file

@ -65,7 +65,7 @@ def has_comic(name):
("SmackJeeves/%s" % name).lower(),
]
for scraperclass in get_scraperclasses():
lname = scraperclass.get_name().lower()
lname = scraperclass.getName().lower()
if lname in names:
return True
return False

View file

@ -275,7 +275,7 @@ def has_comic(name):
("Arcamax/%s" % name).lower(),
]
for scraperclass in get_scraperclasses():
lname = scraperclass.get_name().lower()
lname = scraperclass.getName().lower()
if lname in names:
return True
return False

View file

@ -83,7 +83,7 @@ def has_creators_comic(name):
"""Test if comic name already exists."""
cname = "Creators/%s" % name
for scraperclass in get_scraperclasses():
lname = scraperclass.get_name().lower()
lname = scraperclass.getName().lower()
if lname == cname.lower():
return True
return False

View file

@ -407,7 +407,7 @@ def has_comic(name):
cname = ("Creators/%s" % name).lower()
gname = ("GoComics/%s" % name).lower()
for scraperclass in get_scraperclasses():
lname = scraperclass.get_name().lower()
lname = scraperclass.getName().lower()
if lname == cname or lname == gname:
return True
return False

View file

@ -291,7 +291,7 @@ def has_comic(name):
"""Check if comic name already exists."""
cname = name.lower()
for scraperclass in get_scraperclasses():
lname = scraperclass.get_name().lower()
lname = scraperclass.getName().lower()
if lname == cname:
return True
return False

View file

@ -1,6 +1,5 @@
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
# Dosage, the webcomic downloader
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
from __future__ import print_function
@ -394,7 +393,8 @@ class MyRegister (register, object):
args = dict(
name = AppName,
version = AppVersion,
description = 'a commandline webcomic downloader and archiver',
description = 'a commandline comic downloader and archiver',
keywords = 'comic,webcomic,downloader,archiver',
author = 'Tristan Seligmann, Jonathan Jacobs, Bastian Kleineidam',
author_email = 'bastian.kleineidam@web.de',
maintainer = 'Bastian Kleineidam',

View file

@ -8,7 +8,7 @@ class TestComicNames(TestCase):
def test_names(self):
for scraperclass in scraper.get_scraperclasses():
name = scraperclass.get_name()
name = scraperclass.getName()
self.assertTrue(name.count('/') <= 1, name)
if '/' in name:
comicname = name.split('/')[1]

View file

@ -34,7 +34,7 @@ class _ComicTester(TestCase):
scraperclass=None
def setUp(self):
self.name = self.scraperclass.get_name()
self.name = self.scraperclass.getName()
self.url = self.scraperclass.starter()
# create a temporary directory for images
self.tmpdir = tempfile.mkdtemp()
@ -64,7 +64,7 @@ class _ComicTester(TestCase):
def _test_comic(self, scraperobj):
num = 0
max_strips = 5
for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
for strip in scraperobj.getStrips(max_strips):
images = []
for image in strip.getImages():
images.append(image.url)
@ -122,7 +122,7 @@ def generate_comic_testers():
g = globals()
if "TRAVIS" in os.environ:
# Get limited number of scraper tests on Travis builds.
max_scrapers = 1500
max_scrapers = 500
scraperclasses = islice(scraper.get_scraperclasses(), 0, max_scrapers)
else:
scraperclasses = scraper.get_scraperclasses()

View file

@ -52,6 +52,11 @@ class TestDosage (unittest.TestCase):
self.assertRaises(OSError, run_with_options, [])
self.assertRaises(OSError, run_with_options, ['--imadoofus'])
def test_fetch(self):
def test_fetch_html(self):
run_with_options(["-n", "2", "-b", self.tmpdir, "-o", "html", "calvinandhobbes"])
def test_fetch_rss(self):
run_with_options(["--numstrips", "2", "--baseurl", "bla", "--basepath", self.tmpdir, "--output", "rss", "--adult", "sexyloser"])
def test_fetch_indexed(self):
run_with_options(["-n", "2", "-b", self.tmpdir, "calvinandhobbes:2012/02/02"])