diff --git a/doc/README.txt b/doc/README.txt
index 8e37c349f..a7831ebe3 100644
--- a/doc/README.txt
+++ b/doc/README.txt
@@ -1,14 +1,14 @@
Dosage
=======
-Dosage is a commandline webcomic downloader and archiver.
+Dosage is a commandline comic downloader and archiver.
Introduction
-------------
Dosage is designed to keep a local copy of specific webcomics
and other picture-based content such as Picture of the Day sites.
With the dosage commandline script you can get the latest strip of
-webcomic, or catch-up to the last strip downloaded, or download a
+a webcomic, or catch-up to the last strip downloaded, or download a
strip for a particular date/index (if the webcomic's site layout
makes this possible).
@@ -91,7 +91,7 @@ Technical Description
Dosage is written in Python and relies on regular expressions to
do most of the grunt work.
-For each webcomic Dosage has a plugin module, found in the "plugins"
+For each comic Dosage has a plugin module, found in the "plugins"
subdirectory of the dosagelib directory. Each module is a subclass of
the _BasicComic class and specifies where to download its comic images.
Some comic syndicates (GoComics for example) have a standard layout for all
@@ -100,7 +100,7 @@ instances from a given list of comic strips.
Extending Dosage
-----------------
-In order to add a new webcomic, a new module class has to be created in
+In order to add a new comic, a new module class has to be created in
one of the *.py files in the dosagelib/plugins subdirectory.
Look at the existing module classes for examples.
diff --git a/doc/changelog.txt b/doc/changelog.txt
index 59c4e255f..3286db979 100644
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@@ -1,5 +1,10 @@
Dosage 1.13 (released xx.xx.2013)
+Features:
+- comics: Added comic strips AxeCop, Bearmageddon, DeadWinter,
+ HarkAVagrant, IAmArg, LoadingArtist, Nnewts, PHDComics, PokeyThePenguin,
+ SnowFlame and WorldOfMrToast.
+
Fixes:
- cmdline: Catch error when piping output to another
program or file under Windows.
diff --git a/doc/dosage.1 b/doc/dosage.1
index 9ab319e1b..725879441 100644
--- a/doc/dosage.1
+++ b/doc/dosage.1
@@ -1,6 +1,6 @@
.TH DOSAGE 1
.SH NAME
-dosage \- a commandline webcomic downloader and archiver
+dosage \- a commandline comic downloader and archiver
.SH SYNOPSIS
\fBdosage\fP [\fIoptions\fP] \fImodule\fP...
.SH DESCRIPTION
@@ -128,7 +128,7 @@ Retrieve the Penny Arcade strip for a given index:
Retrieve Calvin and Hobbes strips from a given index going backwards to
the beginning.
.RS
-.B dosage \-a calvinandhobbes:20120722
+.B dosage \-a calvinandhobbes:2012/07/22
.RE
.PP
On Unix, \fBxargs(1)\fP can download several comic strips in parallel,
diff --git a/doc/dosage.1.html b/doc/dosage.1.html
index 27e9a5ee0..6f04bde3d 100644
--- a/doc/dosage.1.html
+++ b/doc/dosage.1.html
@@ -9,7 +9,7 @@ Section: User Commands (1)
Index
NAME
-dosage - a commandline webcomic downloader and archiver
+dosage - a commandline comic downloader and archiver
SYNOPSIS
@@ -174,7 +174,7 @@ Retrieve the Penny Arcade strip for a given index:
Retrieve Calvin and Hobbes strips from a given index going backwards to
the beginning.
-
-dosage -a calvinandhobbes:20120722
+dosage -a calvinandhobbes:2012/07/22
diff --git a/doc/dosage.txt b/doc/dosage.txt
index 1cdd85457..28adfdfa5 100644
--- a/doc/dosage.txt
+++ b/doc/dosage.txt
@@ -3,7 +3,7 @@ DOSAGE(1) DOSAGE(1)
NAME
- dosage - a commandline webcomic downloader and archiver
+ dosage - a commandline comic downloader and archiver
SYNOPSIS
dosage [options] module...
@@ -116,7 +116,7 @@ EXAMPLES
Retrieve Calvin and Hobbes strips from a given index going
backwards to the beginning.
- dosage -a calvinandhobbes:20120722
+ dosage -a calvinandhobbes:2012/07/22
On Unix, xargs(1) can download several comic strips in paral-
lel, for example using up to 4 processes:
diff --git a/dosage b/dosage
index a15a980a9..07ca73289 100755
--- a/dosage
+++ b/dosage
@@ -1,6 +1,5 @@
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
-# Dosage, the webcomic downloader
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
from __future__ import print_function
@@ -56,7 +55,7 @@ def setupOptions():
@rtype argparse.ArgumentParser
"""
kwargs = dict(
- description = "A commandline webcomic downloader and archiver.",
+ description = "A commandline comic downloader and archiver.",
epilog = Examples,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
@@ -131,7 +130,7 @@ def displayHelp(comics):
def displayComicHelp(scraperobj):
"""Print description and help for a comic."""
- out.context = scraperobj.get_name()
+ out.context = scraperobj.getName()
try:
if scraperobj.description:
out.info("Description: " + scraperobj.description)
@@ -163,15 +162,16 @@ def getComics(options):
def getStrips(scraperobj, options):
"""Get all strips from a scraper."""
errors = 0
- out.context = scraperobj.get_name()
+ out.context = scraperobj.getName()
if options.all:
- strips = scraperobj.getAllStrips()
+ numstrips = None
elif options.numstrips:
- strips = scraperobj.getAllStrips(options.numstrips)
+ numstrips = options.numstrips
else:
- strips = scraperobj.getCurrentStrips()
+ # get current strip
+ numstrips = 1
try:
- for strip in strips:
+ for strip in scraperobj.getStrips(numstrips):
_errors, skipped = saveComicStrip(strip, options.basepath)
errors += _errors
if skipped and options.cont:
@@ -206,7 +206,7 @@ def doList(columnList=True):
"""List available comics."""
out.info('Available comic scrapers:')
out.info('Comics marked with [A] require age confirmation with the --adult option.')
- scrapers = sorted(getScrapers(['@@']), key=lambda s: s.get_name())
+ scrapers = sorted(getScrapers(['@@']), key=lambda s: s.getName())
try:
if columnList:
num = doColumnList(scrapers)
@@ -243,7 +243,7 @@ def doColumnList(scrapers):
def getScraperName(scraperobj, limit=None):
"""Get comic scraper name."""
suffix = " [A]" if scraperobj.adult else ""
- name = scraperobj.get_name()
+ name = scraperobj.getName()
if limit is not None:
name = strlimit(name, limit)
return name + suffix
@@ -259,7 +259,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
if not adult and scraperclass.adult:
warn_adult(scraperclass)
continue
- dirname = getDirname(scraperclass.get_name())
+ dirname = getDirname(scraperclass.getName())
if os.path.isdir(os.path.join(basepath, dirname)):
yield scraperclass()
elif '@@' in comics:
@@ -293,7 +293,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
def warn_adult(scraperclass):
"""Print warning about adult content."""
- out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.get_name())
+ out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName())
def main():
diff --git a/dosagelib/__init__.py b/dosagelib/__init__.py
index 0814ee8ad..3f8807f24 100644
--- a/dosagelib/__init__.py
+++ b/dosagelib/__init__.py
@@ -1,8 +1,8 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012 Bastian Kleineidam
+# Copyright (C) 2012-2013 Bastian Kleineidam
"""
-Automated webcomic downloader. Dosage traverses webcomic websites in
+Automated comic downloader. Dosage traverses comic websites in
order to download each strip of the comic. The intended use is for
mirroring the strips locally for ease of viewing; redistribution of the
downloaded strips may violate copyright, and is not advisable unless you
@@ -11,7 +11,7 @@ your intentions, and received permission to distribute.
The primary dosage interface is currently the 'mainline' script, which
is just a thin wrapper that invokes L{dosage.mainline}. Comic modules
-for each webcomic are located in L{dosage.modules}; most of these make
+for each comic are located in L{dosage.modules}; most of these make
use of the helper base classes and mixins in L{dosage.modules.helpers},
thus making their individual implementations trivial.
"""
diff --git a/dosagelib/comic.py b/dosagelib/comic.py
index e991dd9be..6c5bf28b0 100644
--- a/dosagelib/comic.py
+++ b/dosagelib/comic.py
@@ -88,6 +88,7 @@ class ComicImage(object):
def save(self, basepath):
"""Save comic URL to filename on disk."""
+ out.info("Get image URL %s" % self.url, level=1)
self.connect()
filename = "%s%s" % (self.filename, self.ext)
comicSize = self.contentLength
@@ -96,6 +97,7 @@ class ComicImage(object):
os.makedirs(comicDir)
fn = os.path.join(comicDir, filename)
+ # compare with >= since comicSize could be the compressed size
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
self.touch(fn)
out.info('Skipping existing file "%s".' % fn)
diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py
index f44fbe8da..4b13d96df 100644
--- a/dosagelib/plugins/a.py
+++ b/dosagelib/plugins/a.py
@@ -159,14 +159,17 @@ class AstronomyPOTD(_BasicScraper):
stripUrl = 'http://antwrp.gsfc.nasa.gov/apod/ap%s.html'
imageSearch = compile(tagre("a", "href", r'(image/\d{4}/[^"]+)'))
multipleImagesPerStrip = True
- noImageUrls = set([
- 'http://antwrp.gsfc.nasa.gov/apod/ap130217.html', # video
- 'http://antwrp.gsfc.nasa.gov/apod/ap130218.html', # video
- 'http://antwrp.gsfc.nasa.gov/apod/ap130226.html', # video
- ])
prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "<")
help = 'Index format: yymmdd'
+ def shouldSkipUrl(self, url):
+ """Skip pages without images."""
+ return url in (
+ 'http://antwrp.gsfc.nasa.gov/apod/ap130217.html', # video
+ 'http://antwrp.gsfc.nasa.gov/apod/ap130218.html', # video
+ 'http://antwrp.gsfc.nasa.gov/apod/ap130226.html', # video
+ )
+
@classmethod
def namer(cls, imageUrl, pageUrl):
return '%s-%s' % (pageUrl.split('/')[-1].split('.')[0][2:],
@@ -269,3 +272,14 @@ class Annyseed(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(Annyseed[^"]+)'))
prevSearch = compile(r']*>Previous Strip', IGNORECASE)
help = 'Index format: yyyy-mm-dd'
- def setStrip(self, index):
- self.currentUrl = self.stripUrl % tuple(map(int, index.split('-')))
+ def getIndexStripUrl(self, index):
+ return self.stripUrl % tuple(map(int, index.split('-')))
class KhaosKomix(_BasicScraper):
diff --git a/dosagelib/plugins/l.py b/dosagelib/plugins/l.py
index 027f5e402..5cd6bc91a 100644
--- a/dosagelib/plugins/l.py
+++ b/dosagelib/plugins/l.py
@@ -24,6 +24,15 @@ class Lint(_BasicScraper):
help = 'Index format: yyyy/mm/dd/num-name'
+class LoadingArtist(_BasicScraper):
+ url = 'http://www.loadingartist.com/'
+ stripUrl = url + '%s/'
+ firstStripUrl = stripUrl % '2011/01/04/born'
+ imageSearch = compile(tagre("img", "src", r'(http://www\.loadingartist\.com/comics/[^"]+)'))
+ prevSearch = compile(tagre("a", "href", r'(http://www\.loadingartist\.com/\d+/\d+/\d+/[^"]+/)', after="prev"))
+ help = 'Index format: yyyy/mm/dd/stripname'
+
+
class LookingForGroup(_BasicScraper):
url = 'http://www.lfgcomic.com/'
stripUrl = url + 'page/%s/'
@@ -33,6 +42,7 @@ class LookingForGroup(_BasicScraper):
nameSearch = compile(r'/page/(\d+)/')
help = 'Index format: nnn'
+ @classmethod
def namer(self, imageUrl, pageUrl):
return self.nameSearch.search(pageUrl).group(1)
diff --git a/dosagelib/plugins/n.py b/dosagelib/plugins/n.py
index 5133f76fd..a1e86f82f 100644
--- a/dosagelib/plugins/n.py
+++ b/dosagelib/plugins/n.py
@@ -69,6 +69,15 @@ class Nicky510(_BasicScraper):
help = 'Index format: stripname'
+class Nnewts(_BasicScraper):
+ url = 'http://nnewts.com/'
+ stripUrl = url + '%s/'
+ firstStripUrl = stripUrl % 'nnewts-page-1'
+ imageSearch = compile(tagre("img", "src", r'(http://nnewts\.com/newty/comics/[^"]+)'))
+ prevSearch = compile(tagre("a", "href", r'(http://nnewts\.com/(?:nnewts-)?page-\d+/)', after="navi-prev"))
+ help = 'Index format: page-number'
+
+
class NoNeedForBushido(_BasicScraper):
url = 'http://noneedforbushido.com/latest/'
stripUrl = 'http://noneedforbushido.com/%s/'
diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py
index 559f72de9..77f776d72 100644
--- a/dosagelib/plugins/p.py
+++ b/dosagelib/plugins/p.py
@@ -83,6 +83,17 @@ class PeppermintSaga(_BasicScraper):
help = 'Index format: number'
+class PHDComics(_BasicScraper):
+ baseurl = 'http://phdcomics.com/'
+ url = baseurl + 'comics.php'
+ stripUrl = baseurl + 'comics/archive.php?comicid=%s'
+ firstStripUrl = stripUrl % '1'
+ imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd[^ ]+)', quote=""))
+ prevSearch = compile(tagre("a", "href", r'((?:comics/)?archive\.php\?comicid=\d+)', quote="") +
+ tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote=""))
+ help = 'Index format: number'
+
+
class PicPakDog(_BasicScraper):
url = 'http://www.picpak.net/'
stripUrl = url + 'comic/%s/'
@@ -117,6 +128,23 @@ class Pimpette(_BasicScraper):
help = 'Index format: yyyymmdd'
+class PokeyThePenguin(_BasicScraper):
+ baseurl = 'http://www.yellow5.com/pokey/archive/'
+ url = baseurl + 'index558.html'
+ stripUrl = baseurl + 'index%s.html'
+ firstStripUrl = stripUrl % '1'
+ imageSearch = compile(tagre("img", "src", r'(pokey\d+[^"]+)'))
+ multipleImagesPerStrip = True
+ help = 'Index format: number'
+
+ def getPrevUrl(self, url, data, baseUrl):
+ """Decrease index.html number."""
+ mo = compile(r"index(\d+)\.html").search(url)
+ num = int(mo.group(1)) - 1
+ prefix = url.rsplit('/', 1)[0]
+ return "%s/index%d.html" % (prefix, num)
+
+
class Precocious(_BasicScraper):
url = 'http://www.precociouscomic.com/'
starter = indirectStarter(url,
diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py
index fb2cd06de..febffa32e 100644
--- a/dosagelib/plugins/s.py
+++ b/dosagelib/plugins/s.py
@@ -5,7 +5,7 @@
from re import compile, MULTILINE, IGNORECASE, sub
from os.path import splitext
from ..scraper import _BasicScraper
-from ..helpers import indirectStarter
+from ..helpers import indirectStarter, bounceStarter
from ..util import tagre
@@ -150,6 +150,31 @@ class SluggyFreelance(_BasicScraper):
help = 'Index format: yymmdd'
+class SnowFlame(_BasicScraper):
+ url = 'http://www.snowflamecomic.com/'
+ stripUrl = url + '?comic=snowflame-%s-%s'
+ firstStripUrl = stripUrl % ('01', '01')
+ imageSearch = compile(tagre("img", "src", r'(http://www\.snowflamecomic\.com/wp-content/uploads/\d+/\d+/[^"]+)'))
+ prevSearch = compile(tagre("span", "class", "mininav-prev") +
+ tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)'))
+ starter = bounceStarter(url,
+ compile(tagre("span", "class", "mininav-next") +
+ tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)')))
+ help = 'Index format: chapter-page'
+
+    def getIndexStripUrl(self, index):
+        return self.stripUrl % tuple(index.split('-'))
+
+ @classmethod
+ def namer(cls, imageUrl, pageUrl):
+ prefix, filename = imageUrl.rsplit('/', 1)
+ ro = compile(r'snowflame-([^-]+)-([^-]+)')
+ mo = ro.search(pageUrl)
+ chapter = mo.group(1)
+ page = mo.group(2)
+ return "%s-%s-%s" % (chapter, page, filename)
+
+
class SodiumEyes(_BasicScraper):
url = 'http://sodiumeyes.com/'
stripUrl = url + '%s/'
diff --git a/dosagelib/plugins/w.py b/dosagelib/plugins/w.py
index 0b55a6261..633b00f36 100644
--- a/dosagelib/plugins/w.py
+++ b/dosagelib/plugins/w.py
@@ -57,6 +57,40 @@ class Wonderella(_BasicScraper):
help = 'Index format: yyyy/mm/dd/name'
+class WorldOfMrToast(_BasicScraper):
+ baseurl = 'http://www.theimaginaryworld.com/'
+ url = baseurl + 'mrTcomicA.html'
+ stripUrl = baseurl + '%s.html'
+ imageSearch = compile(tagre("img", "src", r'(comic[^"]+)'))
+ # list the archive links since there is no prev/next navigation
+ prevurls = (
+ url,
+ baseurl + 'mrTcomicW02.html',
+ baseurl + 'mrTcomicW01.html',
+ baseurl + 'mrGcomic03.html',
+ baseurl + 'mrGcomic02.html',
+ baseurl + 'mrGcomic01.html',
+ baseurl + 'mrTcomicT05.html',
+ baseurl + 'mrTcomicT04.html',
+ baseurl + 'mrTcomicT03.html',
+ baseurl + 'mrTcomicT02.html',
+ baseurl + 'mrTcomicT01.html',
+ baseurl + 'mrTcomicIW3.html',
+ baseurl + 'mrTcomicIW2.html',
+ baseurl + 'mrTcomicIW1.html',
+ )
+ firstStripUrl = prevurls[-1]
+ multipleImagesPerStrip = True
+ help = 'Index format: none'
+
+ def getPrevUrl(self, url, data, baseUrl):
+ idx = self.prevurls.index(url)
+ try:
+ return self.prevurls[idx+1]
+ except IndexError:
+ return None
+
+
class WotNow(_BasicScraper):
url = 'http://shadowburn.binmode.com/wotnow/'
stripUrl = url + 'comic.php?comic_id=%s'
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index df7367e2d..3d924ea37 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -33,9 +33,6 @@ class _BasicScraper(object):
# if more than one image per URL is expected
multipleImagesPerStrip = False
- # set of URLs that have no image (eg. only a video link)
- noImageUrls = set()
-
# set to False if previous URLs do not match the strip URL (ie. because of redirects)
prevUrlMatchesStripUrl = True
@@ -55,7 +52,7 @@ class _BasicScraper(object):
"""Initialize internal variables."""
self.urls = set()
if indexes:
- self.indexes = tuple(indexes)
+ self.indexes = tuple(sorted(indexes))
else:
self.indexes = tuple()
self.skippedUrls = set()
@@ -66,7 +63,7 @@ class _BasicScraper(object):
if not isinstance(other, _BasicScraper):
return 1
# first, order by name
- d = cmp(self.get_name(), other.get_name())
+ d = cmp(self.getName(), other.getName())
if d != 0:
return d
# then by indexes
@@ -74,65 +71,41 @@ class _BasicScraper(object):
def __hash__(self):
"""Get hash value from name and index list."""
- return hash((self.get_name(), self.indexes))
+ return hash((self.getName(), self.indexes))
- def getCurrentStrips(self):
- """Get current comic strip."""
- msg = 'Retrieving the current strip'
- if self.indexes:
- msg += " for indexes %s" % self.indexes
- out.info(msg+"...")
- if self.indexes:
- for index in self.indexes:
- url = self.stripUrl % index
- if url in self.noImageUrls:
- self.skipUrl(url)
- else:
- yield self.getStrip(url)
-
- else:
- url = self.getLatestUrl()
- if url in self.noImageUrls:
- self.skipUrl(url)
- else:
- yield self.getStrip(self.getLatestUrl())
-
- def skipUrl(self, url):
- """Document that an URL had no images."""
- out.info('Skipping URL %s without image' % url)
- self.skippedUrls.add(url)
-
- def getStrip(self, url):
- """Get comic strip for given URL."""
- data, baseUrl = getPageContent(url, self.session)
- return self.getComicStrip(url, data, baseUrl)
+ def shouldSkipUrl(self, url):
+ """Determine if search for images in given URL should be skipped."""
+ return False
def getComicStrip(self, url, data, baseUrl):
"""Get comic strip downloader for given URL and data."""
imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
imageUrls = set(map(self.imageUrlModifier, imageUrls))
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
- out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
- return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)
+ out.warn("found %d images instead of 1 at %s with %s" % (len(imageUrls), url, self.imageSearch.pattern))
+ elif not imageUrls:
+ out.warn("found no images at %s with %s" % (url, self.imageSearch.pattern))
+ return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
- def getAllStrips(self, maxstrips=None):
- """Get all comic strips."""
+ def getStrips(self, maxstrips=None):
+ """Get comic strips."""
if maxstrips:
- msg = 'Retrieving %d strips' % maxstrips
+ word = "strip" if maxstrips == 1 else "strips"
+ msg = 'Retrieving %d %s' % (maxstrips, word)
else:
msg = 'Retrieving all strips'
if self.indexes:
- msg += " for indexes %s" % self.indexes
+ if len(self.indexes) == 1:
+ msg += " for index %s" % self.indexes[0]
+ else:
+ msg += " for indexes %s" % self.indexes
+ urls = [self.getIndexStripUrl(index) for index in self.indexes]
+ else:
+ urls = [self.getLatestUrl()]
if self.adult:
msg += " (including adult content)"
out.info(msg)
- if self.indexes:
- for index in self.indexes:
- url = self.stripUrl % index
- for strip in self.getStripsFor(url, maxstrips):
- yield strip
- else:
- url = self.getLatestUrl()
+ for url in urls:
for strip in self.getStripsFor(url, maxstrips):
yield strip
@@ -142,42 +115,49 @@ class _BasicScraper(object):
self.hitFirstStripUrl = False
seen_urls = set()
while url:
+ out.info('Get strip URL %s' % url, level=1)
data, baseUrl = getPageContent(url, self.session)
- if url in self.noImageUrls:
- self.skipUrl(url)
+ if self.shouldSkipUrl(url):
+ out.info('Skipping URL %s' % url)
+ self.skippedUrls.add(url)
else:
yield self.getComicStrip(url, data, baseUrl)
if self.firstStripUrl == url:
out.debug("Stop at first URL %s" % url)
self.hitFirstStripUrl = True
break
- prevUrl = None
- if self.prevSearch:
- try:
- prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
- except ValueError as msg:
- # assume there is no previous URL, but print a warning
- out.warn("%s Assuming no previous comic strips exist." % msg)
- else:
- prevUrl = self.prevUrlModifier(prevUrl)
- out.debug("Matched previous URL %s" % prevUrl)
+ if maxstrips is not None:
+ maxstrips -= 1
+ if maxstrips <= 0:
+ break
+ prevUrl = self.getPrevUrl(url, data, baseUrl)
seen_urls.add(url)
if prevUrl in seen_urls:
# avoid recursive URL loops
out.warn("Already seen previous URL %r" % prevUrl)
break
url = prevUrl
- if maxstrips is not None:
- maxstrips -= 1
- if maxstrips <= 0:
- break
- def setStrip(self, index):
- """Set current comic strip URL."""
- self.currentUrl = self.stripUrl % index
+ def getPrevUrl(self, url, data, baseUrl):
+ """Find previous URL."""
+ prevUrl = None
+ if self.prevSearch:
+ try:
+ prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
+ except ValueError as msg:
+ # assume there is no previous URL, but print a warning
+ out.warn("%s Assuming no previous comic strips exist." % msg)
+ else:
+ prevUrl = self.prevUrlModifier(prevUrl)
+ out.debug("Matched previous URL %s" % prevUrl)
+ return prevUrl
+
+ def getIndexStripUrl(self, index):
+ """Get comic strip URL from index."""
+ return self.stripUrl % index
@classmethod
- def get_name(cls):
+ def getName(cls):
"""Get scraper name."""
if hasattr(cls, 'name'):
return cls.name
@@ -209,10 +189,6 @@ class _BasicScraper(object):
"""
return imageUrl
- def getFilename(self, imageUrl, pageUrl):
- """Return filename for given image and page URL."""
- return self.namer(imageUrl, pageUrl)
-
def getLatestUrl(self):
"""Get starter URL from where to scrape comic strips."""
return self.starter()
@@ -227,7 +203,7 @@ def find_scraperclasses(comic, multiple_allowed=False):
candidates = []
cname = comic.lower()
for scraperclass in get_scraperclasses():
- lname = scraperclass.get_name().lower()
+ lname = scraperclass.getName().lower()
if lname == cname:
# perfect match
if not multiple_allowed:
@@ -237,7 +213,7 @@ def find_scraperclasses(comic, multiple_allowed=False):
elif cname in lname:
candidates.append(scraperclass)
if len(candidates) > 1 and not multiple_allowed:
- comics = ", ".join(x.get_name() for x in candidates)
+ comics = ", ".join(x.getName() for x in candidates)
raise ValueError('multiple comics found: %s' % comics)
elif not candidates:
raise ValueError('comic %r not found' % comic)
@@ -266,10 +242,10 @@ def check_scrapers():
"""Check for duplicate scraper class names."""
d = {}
for scraperclass in _scraperclasses:
- name = scraperclass.get_name().lower()
+ name = scraperclass.getName().lower()
if name in d:
- name1 = scraperclass.get_name()
- name2 = d[name].get_name()
+ name1 = scraperclass.getName()
+ name2 = d[name].getName()
raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))
d[name] = scraperclass
diff --git a/scripts/arcamax.py b/scripts/arcamax.py
index 8e0b6f4b2..e2ace6d09 100755
--- a/scripts/arcamax.py
+++ b/scripts/arcamax.py
@@ -65,7 +65,7 @@ def has_comic(name):
("SmackJeeves/%s" % name).lower(),
]
for scraperclass in get_scraperclasses():
- lname = scraperclass.get_name().lower()
+ lname = scraperclass.getName().lower()
if lname in names:
return True
return False
diff --git a/scripts/comicfury.py b/scripts/comicfury.py
index dbd2a4b5d..0103b6ace 100755
--- a/scripts/comicfury.py
+++ b/scripts/comicfury.py
@@ -275,7 +275,7 @@ def has_comic(name):
("Arcamax/%s" % name).lower(),
]
for scraperclass in get_scraperclasses():
- lname = scraperclass.get_name().lower()
+ lname = scraperclass.getName().lower()
if lname in names:
return True
return False
diff --git a/scripts/gocomics.py b/scripts/gocomics.py
index e5e48f213..455b95d56 100755
--- a/scripts/gocomics.py
+++ b/scripts/gocomics.py
@@ -83,7 +83,7 @@ def has_creators_comic(name):
"""Test if comic name already exists."""
cname = "Creators/%s" % name
for scraperclass in get_scraperclasses():
- lname = scraperclass.get_name().lower()
+ lname = scraperclass.getName().lower()
if lname == cname.lower():
return True
return False
diff --git a/scripts/keenspot.py b/scripts/keenspot.py
index 6f22a4d8f..29901e024 100755
--- a/scripts/keenspot.py
+++ b/scripts/keenspot.py
@@ -407,7 +407,7 @@ def has_comic(name):
cname = ("Creators/%s" % name).lower()
gname = ("GoComics/%s" % name).lower()
for scraperclass in get_scraperclasses():
- lname = scraperclass.get_name().lower()
+ lname = scraperclass.getName().lower()
if lname == cname or lname == gname:
return True
return False
diff --git a/scripts/smackjeeves.py b/scripts/smackjeeves.py
index eccc8a2c6..953b7959f 100755
--- a/scripts/smackjeeves.py
+++ b/scripts/smackjeeves.py
@@ -291,7 +291,7 @@ def has_comic(name):
"""Check if comic name already exists."""
cname = name.lower()
for scraperclass in get_scraperclasses():
- lname = scraperclass.get_name().lower()
+ lname = scraperclass.getName().lower()
if lname == cname:
return True
return False
diff --git a/setup.py b/setup.py
index fe2ae4691..24cbc401a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,5 @@
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
-# Dosage, the webcomic downloader
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
from __future__ import print_function
@@ -394,7 +393,8 @@ class MyRegister (register, object):
args = dict(
name = AppName,
version = AppVersion,
- description = 'a commandline webcomic downloader and archiver',
+ description = 'a commandline comic downloader and archiver',
+ keywords = 'comic,webcomic,downloader,archiver',
author = 'Tristan Seligmann, Jonathan Jacobs, Bastian Kleineidam',
author_email = 'bastian.kleineidam@web.de',
maintainer = 'Bastian Kleineidam',
diff --git a/tests/test_comicnames.py b/tests/test_comicnames.py
index 8aea08b16..932729c82 100644
--- a/tests/test_comicnames.py
+++ b/tests/test_comicnames.py
@@ -8,7 +8,7 @@ class TestComicNames(TestCase):
def test_names(self):
for scraperclass in scraper.get_scraperclasses():
- name = scraperclass.get_name()
+ name = scraperclass.getName()
self.assertTrue(name.count('/') <= 1, name)
if '/' in name:
comicname = name.split('/')[1]
diff --git a/tests/test_comics.py b/tests/test_comics.py
index fbc3bdf28..c250c96b5 100644
--- a/tests/test_comics.py
+++ b/tests/test_comics.py
@@ -34,7 +34,7 @@ class _ComicTester(TestCase):
scraperclass=None
def setUp(self):
- self.name = self.scraperclass.get_name()
+ self.name = self.scraperclass.getName()
self.url = self.scraperclass.starter()
# create a temporary directory for images
self.tmpdir = tempfile.mkdtemp()
@@ -64,7 +64,7 @@ class _ComicTester(TestCase):
def _test_comic(self, scraperobj):
num = 0
max_strips = 5
- for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
+ for strip in scraperobj.getStrips(max_strips):
images = []
for image in strip.getImages():
images.append(image.url)
@@ -122,7 +122,7 @@ def generate_comic_testers():
g = globals()
if "TRAVIS" in os.environ:
# Get limited number of scraper tests on Travis builds.
- max_scrapers = 1500
+ max_scrapers = 500
scraperclasses = islice(scraper.get_scraperclasses(), 0, max_scrapers)
else:
scraperclasses = scraper.get_scraperclasses()
diff --git a/tests/test_dosage.py b/tests/test_dosage.py
index d4440b085..6b6ceea64 100644
--- a/tests/test_dosage.py
+++ b/tests/test_dosage.py
@@ -52,6 +52,11 @@ class TestDosage (unittest.TestCase):
self.assertRaises(OSError, run_with_options, [])
self.assertRaises(OSError, run_with_options, ['--imadoofus'])
- def test_fetch(self):
+ def test_fetch_html(self):
run_with_options(["-n", "2", "-b", self.tmpdir, "-o", "html", "calvinandhobbes"])
+
+ def test_fetch_rss(self):
run_with_options(["--numstrips", "2", "--baseurl", "bla", "--basepath", self.tmpdir, "--output", "rss", "--adult", "sexyloser"])
+
+ def test_fetch_indexed(self):
+ run_with_options(["-n", "2", "-b", self.tmpdir, "calvinandhobbes:2012/02/02"])