diff --git a/doc/README.txt b/doc/README.txt index 8e37c349f..a7831ebe3 100644 --- a/doc/README.txt +++ b/doc/README.txt @@ -1,14 +1,14 @@ Dosage ======= -Dosage is a commandline webcomic downloader and archiver. +Dosage is a commandline comic downloader and archiver. Introduction ------------- Dosage is designed to keep a local copy of specific webcomics and other picture-based content such as Picture of the Day sites. With the dosage commandline script you can get the latest strip of -webcomic, or catch-up to the last strip downloaded, or download a +a webcomic, or catch-up to the last strip downloaded, or download a strip for a particular date/index (if the webcomic's site layout makes this possible). @@ -91,7 +91,7 @@ Technical Description Dosage is written in Python and relies on regular expressions to do most of the grunt work. -For each webcomic Dosage has a plugin module, found in the "plugins" +For each comic Dosage has a plugin module, found in the "plugins" subdirectory of the dosagelib directory. Each module is a subclass of the _BasicComic class and specifies where to download its comic images. Some comic syndicates (GoComics for example) have a standard layout for all @@ -100,7 +100,7 @@ instances from a given list of comic strips. Extending Dosage ----------------- -In order to add a new webcomic, a new module class has to be created in +In order to add a new comic, a new module class has to be created in one of the *.py files in the dosagelib/plugins subdirectory. Look at the existing module classes for examples. diff --git a/doc/changelog.txt b/doc/changelog.txt index 59c4e255f..3286db979 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -1,5 +1,10 @@ Dosage 1.13 (released xx.xx.2013) +Features: +- comics: Added comic strips AxeCop, Bearmageddon, DeadWinter, + HarkAVagrant, IAmArg, LoadingArtist, Nnewts, PHDComics, PokeyThePenguin, + SnowFlame and WorldOfMrToast. 
+ Fixes: - cmdline: Catch error when piping output to another program or file under Windows. diff --git a/doc/dosage.1 b/doc/dosage.1 index 9ab319e1b..725879441 100644 --- a/doc/dosage.1 +++ b/doc/dosage.1 @@ -1,6 +1,6 @@ .TH DOSAGE 1 .SH NAME -dosage \- a commandline webcomic downloader and archiver +dosage \- a commandline comic downloader and archiver .SH SYNOPSIS \fBdosage\fP [\fIoptions\fP] \fImodule\fP... .SH DESCRIPTION @@ -128,7 +128,7 @@ Retrieve the Penny Arcade strip for a given index: Retrieve Calvin and Hobbes strips from a given index going backwards to the beginning. .RS -.B dosage \-a calvinandhobbes:20120722 +.B dosage \-a calvinandhobbes:2012/07/22 .RE .PP On Unix, \fBxargs(1)\fP can download several comic strips in parallel, diff --git a/doc/dosage.1.html b/doc/dosage.1.html index 27e9a5ee0..6f04bde3d 100644 --- a/doc/dosage.1.html +++ b/doc/dosage.1.html @@ -9,7 +9,7 @@ Section: User Commands (1)
Index  

NAME

-dosage - a commandline webcomic downloader and archiver +dosage - a commandline comic downloader and archiver  

SYNOPSIS

@@ -174,7 +174,7 @@ Retrieve the Penny Arcade strip for a given index: Retrieve Calvin and Hobbes strips from a given index going backwards to the beginning.
-dosage -a calvinandhobbes:20120722 +dosage -a calvinandhobbes:2012/07/22
diff --git a/doc/dosage.txt b/doc/dosage.txt index 1cdd85457..28adfdfa5 100644 --- a/doc/dosage.txt +++ b/doc/dosage.txt @@ -3,7 +3,7 @@ DOSAGE(1) DOSAGE(1) NAME - dosage - a commandline webcomic downloader and archiver + dosage - a commandline comic downloader and archiver SYNOPSIS dosage [options] module... @@ -116,7 +116,7 @@ EXAMPLES Retrieve Calvin and Hobbes strips from a given index going backwards to the beginning. - dosage -a calvinandhobbes:20120722 + dosage -a calvinandhobbes:2012/07/22 On Unix, xargs(1) can download several comic strips in paral‐ lel, for example using up to 4 processes: diff --git a/dosage b/dosage index a15a980a9..07ca73289 100755 --- a/dosage +++ b/dosage @@ -1,6 +1,5 @@ #!/usr/bin/env python # -*- coding: iso-8859-1 -*- -# Dosage, the webcomic downloader # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2013 Bastian Kleineidam from __future__ import print_function @@ -56,7 +55,7 @@ def setupOptions(): @rtype argparse.ArgumentParser """ kwargs = dict( - description = "A commandline webcomic downloader and archiver.", + description = "A commandline comic downloader and archiver.", epilog = Examples, formatter_class=argparse.RawDescriptionHelpFormatter, ) @@ -131,7 +130,7 @@ def displayHelp(comics): def displayComicHelp(scraperobj): """Print description and help for a comic.""" - out.context = scraperobj.get_name() + out.context = scraperobj.getName() try: if scraperobj.description: out.info("Description: " + scraperobj.description) @@ -163,15 +162,16 @@ def getComics(options): def getStrips(scraperobj, options): """Get all strips from a scraper.""" errors = 0 - out.context = scraperobj.get_name() + out.context = scraperobj.getName() if options.all: - strips = scraperobj.getAllStrips() + numstrips = None elif options.numstrips: - strips = scraperobj.getAllStrips(options.numstrips) + numstrips = options.numstrips else: - strips = scraperobj.getCurrentStrips() + # get current strip + numstrips = 1 
try: - for strip in strips: + for strip in scraperobj.getStrips(numstrips): _errors, skipped = saveComicStrip(strip, options.basepath) errors += _errors if skipped and options.cont: @@ -206,7 +206,7 @@ def doList(columnList=True): """List available comics.""" out.info('Available comic scrapers:') out.info('Comics marked with [A] require age confirmation with the --adult option.') - scrapers = sorted(getScrapers(['@@']), key=lambda s: s.get_name()) + scrapers = sorted(getScrapers(['@@']), key=lambda s: s.getName()) try: if columnList: num = doColumnList(scrapers) @@ -243,7 +243,7 @@ def doColumnList(scrapers): def getScraperName(scraperobj, limit=None): """Get comic scraper name.""" suffix = " [A]" if scraperobj.adult else "" - name = scraperobj.get_name() + name = scraperobj.getName() if limit is not None: name = strlimit(name, limit) return name + suffix @@ -259,7 +259,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False): if not adult and scraperclass.adult: warn_adult(scraperclass) continue - dirname = getDirname(scraperclass.get_name()) + dirname = getDirname(scraperclass.getName()) if os.path.isdir(os.path.join(basepath, dirname)): yield scraperclass() elif '@@' in comics: @@ -293,7 +293,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False): def warn_adult(scraperclass): """Print warning about adult content.""" - out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.get_name()) + out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName()) def main(): diff --git a/dosagelib/__init__.py b/dosagelib/__init__.py index 0814ee8ad..3f8807f24 100644 --- a/dosagelib/__init__.py +++ b/dosagelib/__init__.py @@ -1,8 +1,8 @@ # -*- coding: iso-8859-1 -*- # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012 Bastian Kleineidam +# Copyright (C) 2012-2013 Bastian Kleineidam """ -Automated webcomic downloader. 
Dosage traverses webcomic websites in +Automated comic downloader. Dosage traverses comic websites in order to download each strip of the comic. The intended use is for mirroring the strips locally for ease of viewing; redistribution of the downloaded strips may violate copyright, and is not advisable unless you @@ -11,7 +11,7 @@ your intentions, and received permission to distribute. The primary dosage interface is currently the 'mainline' script, which is just a thin wrapper that invokes L{dosage.mainline}. Comic modules -for each webcomic are located in L{dosage.modules}; most of these make +for each comic are located in L{dosage.modules}; most of these make use of the helper base classes and mixins in L{dosage.modules.helpers}, thus making their individual implementations trivial. """ diff --git a/dosagelib/comic.py b/dosagelib/comic.py index e991dd9be..6c5bf28b0 100644 --- a/dosagelib/comic.py +++ b/dosagelib/comic.py @@ -88,6 +88,7 @@ class ComicImage(object): def save(self, basepath): """Save comic URL to filename on disk.""" + out.info("Get image URL %s" % self.url, level=1) self.connect() filename = "%s%s" % (self.filename, self.ext) comicSize = self.contentLength @@ -96,6 +97,7 @@ class ComicImage(object): os.makedirs(comicDir) fn = os.path.join(comicDir, filename) + # compare with >= since comicSize could be the compressed size if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize: self.touch(fn) out.info('Skipping existing file "%s".' 
% fn) diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py index f44fbe8da..4b13d96df 100644 --- a/dosagelib/plugins/a.py +++ b/dosagelib/plugins/a.py @@ -159,14 +159,17 @@ class AstronomyPOTD(_BasicScraper): stripUrl = 'http://antwrp.gsfc.nasa.gov/apod/ap%s.html' imageSearch = compile(tagre("a", "href", r'(image/\d{4}/[^"]+)')) multipleImagesPerStrip = True - noImageUrls = set([ - 'http://antwrp.gsfc.nasa.gov/apod/ap130217.html', # video - 'http://antwrp.gsfc.nasa.gov/apod/ap130218.html', # video - 'http://antwrp.gsfc.nasa.gov/apod/ap130226.html', # video - ]) prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "<") help = 'Index format: yymmdd' + def shouldSkipUrl(self, url): + """Skip pages without images.""" + return url in ( + 'http://antwrp.gsfc.nasa.gov/apod/ap130217.html', # video + 'http://antwrp.gsfc.nasa.gov/apod/ap130218.html', # video + 'http://antwrp.gsfc.nasa.gov/apod/ap130226.html', # video + ) + @classmethod def namer(cls, imageUrl, pageUrl): return '%s-%s' % (pageUrl.split('/')[-1].split('.')[0][2:], @@ -269,3 +272,14 @@ class Annyseed(_BasicScraper): imageSearch = compile(tagre("img", "src", r'(Annyseed[^"]+)')) prevSearch = compile(r']*>Previous Strip', IGNORECASE) help = 'Index format: yyyy-mm-dd' - def setStrip(self, index): - self.currentUrl = self.stripUrl % tuple(map(int, index.split('-'))) + def getIndexStripUrl(self, index): + return self.stripUrl % tuple(map(int, index.split('-'))) class KhaosKomix(_BasicScraper): diff --git a/dosagelib/plugins/l.py b/dosagelib/plugins/l.py index 027f5e402..5cd6bc91a 100644 --- a/dosagelib/plugins/l.py +++ b/dosagelib/plugins/l.py @@ -24,6 +24,15 @@ class Lint(_BasicScraper): help = 'Index format: yyyy/mm/dd/num-name' +class LoadingArtist(_BasicScraper): + url = 'http://www.loadingartist.com/' + stripUrl = url + '%s/' + firstStripUrl = stripUrl % '2011/01/04/born' + imageSearch = compile(tagre("img", "src", r'(http://www\.loadingartist\.com/comics/[^"]+)')) + prevSearch = 
compile(tagre("a", "href", r'(http://www\.loadingartist\.com/\d+/\d+/\d+/[^"]+/)', after="prev")) + help = 'Index format: yyyy/mm/dd/stripname' + + class LookingForGroup(_BasicScraper): url = 'http://www.lfgcomic.com/' stripUrl = url + 'page/%s/' @@ -33,6 +42,7 @@ class LookingForGroup(_BasicScraper): nameSearch = compile(r'/page/(\d+)/') help = 'Index format: nnn' + @classmethod def namer(self, imageUrl, pageUrl): return self.nameSearch.search(pageUrl).group(1) diff --git a/dosagelib/plugins/n.py b/dosagelib/plugins/n.py index 5133f76fd..a1e86f82f 100644 --- a/dosagelib/plugins/n.py +++ b/dosagelib/plugins/n.py @@ -69,6 +69,15 @@ class Nicky510(_BasicScraper): help = 'Index format: stripname' +class Nnewts(_BasicScraper): + url = 'http://nnewts.com/' + stripUrl = url + '%s/' + firstStripUrl = stripUrl % 'nnewts-page-1' + imageSearch = compile(tagre("img", "src", r'(http://nnewts\.com/newty/comics/[^"]+)')) + prevSearch = compile(tagre("a", "href", r'(http://nnewts\.com/(?:nnewts-)?page-\d+/)', after="navi-prev")) + help = 'Index format: page-number' + + class NoNeedForBushido(_BasicScraper): url = 'http://noneedforbushido.com/latest/' stripUrl = 'http://noneedforbushido.com/%s/' diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py index 559f72de9..77f776d72 100644 --- a/dosagelib/plugins/p.py +++ b/dosagelib/plugins/p.py @@ -83,6 +83,17 @@ class PeppermintSaga(_BasicScraper): help = 'Index format: number' +class PHDComics(_BasicScraper): + baseurl = 'http://phdcomics.com/' + url = baseurl + 'comics.php' + stripUrl = baseurl + 'comics/archive.php?comicid=%s' + firstStripUrl = stripUrl % '1' + imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd[^ ]+)', quote="")) + prevSearch = compile(tagre("a", "href", r'((?:comics/)?archive\.php\?comicid=\d+)', quote="") + + tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote="")) + help = 'Index format: number' + + class PicPakDog(_BasicScraper): url = 
'http://www.picpak.net/' stripUrl = url + 'comic/%s/' @@ -117,6 +128,23 @@ class Pimpette(_BasicScraper): help = 'Index format: yyyymmdd' +class PokeyThePenguin(_BasicScraper): + baseurl = 'http://www.yellow5.com/pokey/archive/' + url = baseurl + 'index558.html' + stripUrl = baseurl + 'index%s.html' + firstStripUrl = stripUrl % '1' + imageSearch = compile(tagre("img", "src", r'(pokey\d+[^"]+)')) + multipleImagesPerStrip = True + help = 'Index format: number' + + def getPrevUrl(self, url, data, baseUrl): + """Decrease index.html number.""" + mo = compile(r"index(\d+)\.html").search(url) + num = int(mo.group(1)) - 1 + prefix = url.rsplit('/', 1)[0] + return "%s/index%d.html" % (prefix, num) + + class Precocious(_BasicScraper): url = 'http://www.precociouscomic.com/' starter = indirectStarter(url, diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py index fb2cd06de..febffa32e 100644 --- a/dosagelib/plugins/s.py +++ b/dosagelib/plugins/s.py @@ -5,7 +5,7 @@ from re import compile, MULTILINE, IGNORECASE, sub from os.path import splitext from ..scraper import _BasicScraper -from ..helpers import indirectStarter +from ..helpers import indirectStarter, bounceStarter from ..util import tagre @@ -150,6 +150,31 @@ class SluggyFreelance(_BasicScraper): help = 'Index format: yymmdd' +class SnowFlame(_BasicScraper): + url = 'http://www.snowflamecomic.com/' + stripUrl = url + '?comic=snowflame-%s-%s' + firstStripUrl = stripUrl % ('01', '01') + imageSearch = compile(tagre("img", "src", r'(http://www\.snowflamecomic\.com/wp-content/uploads/\d+/\d+/[^"]+)')) + prevSearch = compile(tagre("span", "class", "mininav-prev") + + tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)')) + starter = bounceStarter(url, + compile(tagre("span", "class", "mininav-next") + + tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)'))) + help = 'Index format: chapter-page' + + def getStripIndexUrl(self, index): + return self.stripUrl % 
index.split('-') + + @classmethod + def namer(cls, imageUrl, pageUrl): + prefix, filename = imageUrl.rsplit('/', 1) + ro = compile(r'snowflame-([^-]+)-([^-]+)') + mo = ro.search(pageUrl) + chapter = mo.group(1) + page = mo.group(2) + return "%s-%s-%s" % (chapter, page, filename) + + class SodiumEyes(_BasicScraper): url = 'http://sodiumeyes.com/' stripUrl = url + '%s/' diff --git a/dosagelib/plugins/w.py b/dosagelib/plugins/w.py index 0b55a6261..633b00f36 100644 --- a/dosagelib/plugins/w.py +++ b/dosagelib/plugins/w.py @@ -57,6 +57,40 @@ class Wonderella(_BasicScraper): help = 'Index format: yyyy/mm/dd/name' +class WorldOfMrToast(_BasicScraper): + baseurl = 'http://www.theimaginaryworld.com/' + url = baseurl + 'mrTcomicA.html' + stripUrl = baseurl + '%s.html' + imageSearch = compile(tagre("img", "src", r'(comic[^"]+)')) + # list the archive links since there is no prev/next navigation + prevurls = ( + url, + baseurl + 'mrTcomicW02.html', + baseurl + 'mrTcomicW01.html', + baseurl + 'mrGcomic03.html', + baseurl + 'mrGcomic02.html', + baseurl + 'mrGcomic01.html', + baseurl + 'mrTcomicT05.html', + baseurl + 'mrTcomicT04.html', + baseurl + 'mrTcomicT03.html', + baseurl + 'mrTcomicT02.html', + baseurl + 'mrTcomicT01.html', + baseurl + 'mrTcomicIW3.html', + baseurl + 'mrTcomicIW2.html', + baseurl + 'mrTcomicIW1.html', + ) + firstStripUrl = prevurls[-1] + multipleImagesPerStrip = True + help = 'Index format: none' + + def getPrevUrl(self, url, data, baseUrl): + idx = self.prevurls.index(url) + try: + return self.prevurls[idx+1] + except IndexError: + return None + + class WotNow(_BasicScraper): url = 'http://shadowburn.binmode.com/wotnow/' stripUrl = url + 'comic.php?comic_id=%s' diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index df7367e2d..3d924ea37 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -33,9 +33,6 @@ class _BasicScraper(object): # if more than one image per URL is expected multipleImagesPerStrip = False - # set of URLs that have no 
image (eg. only a video link) - noImageUrls = set() - # set to False if previous URLs do not match the strip URL (ie. because of redirects) prevUrlMatchesStripUrl = True @@ -55,7 +52,7 @@ class _BasicScraper(object): """Initialize internal variables.""" self.urls = set() if indexes: - self.indexes = tuple(indexes) + self.indexes = tuple(sorted(indexes)) else: self.indexes = tuple() self.skippedUrls = set() @@ -66,7 +63,7 @@ class _BasicScraper(object): if not isinstance(other, _BasicScraper): return 1 # first, order by name - d = cmp(self.get_name(), other.get_name()) + d = cmp(self.getName(), other.getName()) if d != 0: return d # then by indexes @@ -74,65 +71,41 @@ class _BasicScraper(object): def __hash__(self): """Get hash value from name and index list.""" - return hash((self.get_name(), self.indexes)) + return hash((self.getName(), self.indexes)) - def getCurrentStrips(self): - """Get current comic strip.""" - msg = 'Retrieving the current strip' - if self.indexes: - msg += " for indexes %s" % self.indexes - out.info(msg+"...") - if self.indexes: - for index in self.indexes: - url = self.stripUrl % index - if url in self.noImageUrls: - self.skipUrl(url) - else: - yield self.getStrip(url) - - else: - url = self.getLatestUrl() - if url in self.noImageUrls: - self.skipUrl(url) - else: - yield self.getStrip(self.getLatestUrl()) - - def skipUrl(self, url): - """Document that an URL had no images.""" - out.info('Skipping URL %s without image' % url) - self.skippedUrls.add(url) - - def getStrip(self, url): - """Get comic strip for given URL.""" - data, baseUrl = getPageContent(url, self.session) - return self.getComicStrip(url, data, baseUrl) + def shouldSkipUrl(self, url): + """Determine if search for images in given URL should be skipped.""" + return False def getComicStrip(self, url, data, baseUrl): """Get comic strip downloader for given URL and data.""" imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch) imageUrls = set(map(self.imageUrlModifier, 
imageUrls)) if len(imageUrls) > 1 and not self.multipleImagesPerStrip: - out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern)) - return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session) + out.warn("found %d images instead of 1 at %s with %s" % (len(imageUrls), url, self.imageSearch.pattern)) + elif not imageUrls: + out.warn("found no images at %s with %s" % (url, self.imageSearch.pattern)) + return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session) - def getAllStrips(self, maxstrips=None): - """Get all comic strips.""" + def getStrips(self, maxstrips=None): + """Get comic strips.""" if maxstrips: - msg = 'Retrieving %d strips' % maxstrips + word = "strip" if maxstrips == 1 else "strips" + msg = 'Retrieving %d %s' % (maxstrips, word) else: msg = 'Retrieving all strips' if self.indexes: - msg += " for indexes %s" % self.indexes + if len(self.indexes) == 1: + msg += " for index %s" % self.indexes[0] + else: + msg += " for indexes %s" % self.indexes + urls = [self.getIndexStripUrl(index) for index in self.indexes] + else: + urls = [self.getLatestUrl()] if self.adult: msg += " (including adult content)" out.info(msg) - if self.indexes: - for index in self.indexes: - url = self.stripUrl % index - for strip in self.getStripsFor(url, maxstrips): - yield strip - else: - url = self.getLatestUrl() + for url in urls: for strip in self.getStripsFor(url, maxstrips): yield strip @@ -142,42 +115,49 @@ class _BasicScraper(object): self.hitFirstStripUrl = False seen_urls = set() while url: + out.info('Get strip URL %s' % url, level=1) data, baseUrl = getPageContent(url, self.session) - if url in self.noImageUrls: - self.skipUrl(url) + if self.shouldSkipUrl(url): + out.info('Skipping URL %s' % url) + self.skippedUrls.add(url) else: yield self.getComicStrip(url, data, baseUrl) if self.firstStripUrl == url: out.debug("Stop at first URL %s" % url) self.hitFirstStripUrl = True break - prevUrl = None - if 
self.prevSearch: - try: - prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch) - except ValueError as msg: - # assume there is no previous URL, but print a warning - out.warn("%s Assuming no previous comic strips exist." % msg) - else: - prevUrl = self.prevUrlModifier(prevUrl) - out.debug("Matched previous URL %s" % prevUrl) + if maxstrips is not None: + maxstrips -= 1 + if maxstrips <= 0: + break + prevUrl = self.getPrevUrl(url, data, baseUrl) seen_urls.add(url) if prevUrl in seen_urls: # avoid recursive URL loops out.warn("Already seen previous URL %r" % prevUrl) break url = prevUrl - if maxstrips is not None: - maxstrips -= 1 - if maxstrips <= 0: - break - def setStrip(self, index): - """Set current comic strip URL.""" - self.currentUrl = self.stripUrl % index + def getPrevUrl(self, url, data, baseUrl): + """Find previous URL.""" + prevUrl = None + if self.prevSearch: + try: + prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch) + except ValueError as msg: + # assume there is no previous URL, but print a warning + out.warn("%s Assuming no previous comic strips exist." 
% msg) + else: + prevUrl = self.prevUrlModifier(prevUrl) + out.debug("Matched previous URL %s" % prevUrl) + return prevUrl + + def getIndexStripUrl(self, index): + """Get comic strip URL from index.""" + return self.stripUrl % index @classmethod - def get_name(cls): + def getName(cls): """Get scraper name.""" if hasattr(cls, 'name'): return cls.name @@ -209,10 +189,6 @@ class _BasicScraper(object): """ return imageUrl - def getFilename(self, imageUrl, pageUrl): - """Return filename for given image and page URL.""" - return self.namer(imageUrl, pageUrl) - def getLatestUrl(self): """Get starter URL from where to scrape comic strips.""" return self.starter() @@ -227,7 +203,7 @@ def find_scraperclasses(comic, multiple_allowed=False): candidates = [] cname = comic.lower() for scraperclass in get_scraperclasses(): - lname = scraperclass.get_name().lower() + lname = scraperclass.getName().lower() if lname == cname: # perfect match if not multiple_allowed: @@ -237,7 +213,7 @@ def find_scraperclasses(comic, multiple_allowed=False): elif cname in lname: candidates.append(scraperclass) if len(candidates) > 1 and not multiple_allowed: - comics = ", ".join(x.get_name() for x in candidates) + comics = ", ".join(x.getName() for x in candidates) raise ValueError('multiple comics found: %s' % comics) elif not candidates: raise ValueError('comic %r not found' % comic) @@ -266,10 +242,10 @@ def check_scrapers(): """Check for duplicate scraper class names.""" d = {} for scraperclass in _scraperclasses: - name = scraperclass.get_name().lower() + name = scraperclass.getName().lower() if name in d: - name1 = scraperclass.get_name() - name2 = d[name].get_name() + name1 = scraperclass.getName() + name2 = d[name].getName() raise ValueError('duplicate scrapers %s and %s found' % (name1, name2)) d[name] = scraperclass diff --git a/scripts/arcamax.py b/scripts/arcamax.py index 8e0b6f4b2..e2ace6d09 100755 --- a/scripts/arcamax.py +++ b/scripts/arcamax.py @@ -65,7 +65,7 @@ def has_comic(name): 
("SmackJeeves/%s" % name).lower(), ] for scraperclass in get_scraperclasses(): - lname = scraperclass.get_name().lower() + lname = scraperclass.getName().lower() if lname in names: return True return False diff --git a/scripts/comicfury.py b/scripts/comicfury.py index dbd2a4b5d..0103b6ace 100755 --- a/scripts/comicfury.py +++ b/scripts/comicfury.py @@ -275,7 +275,7 @@ def has_comic(name): ("Arcamax/%s" % name).lower(), ] for scraperclass in get_scraperclasses(): - lname = scraperclass.get_name().lower() + lname = scraperclass.getName().lower() if lname in names: return True return False diff --git a/scripts/gocomics.py b/scripts/gocomics.py index e5e48f213..455b95d56 100755 --- a/scripts/gocomics.py +++ b/scripts/gocomics.py @@ -83,7 +83,7 @@ def has_creators_comic(name): """Test if comic name already exists.""" cname = "Creators/%s" % name for scraperclass in get_scraperclasses(): - lname = scraperclass.get_name().lower() + lname = scraperclass.getName().lower() if lname == cname.lower(): return True return False diff --git a/scripts/keenspot.py b/scripts/keenspot.py index 6f22a4d8f..29901e024 100755 --- a/scripts/keenspot.py +++ b/scripts/keenspot.py @@ -407,7 +407,7 @@ def has_comic(name): cname = ("Creators/%s" % name).lower() gname = ("GoComics/%s" % name).lower() for scraperclass in get_scraperclasses(): - lname = scraperclass.get_name().lower() + lname = scraperclass.getName().lower() if lname == cname or lname == gname: return True return False diff --git a/scripts/smackjeeves.py b/scripts/smackjeeves.py index eccc8a2c6..953b7959f 100755 --- a/scripts/smackjeeves.py +++ b/scripts/smackjeeves.py @@ -291,7 +291,7 @@ def has_comic(name): """Check if comic name already exists.""" cname = name.lower() for scraperclass in get_scraperclasses(): - lname = scraperclass.get_name().lower() + lname = scraperclass.getName().lower() if lname == cname: return True return False diff --git a/setup.py b/setup.py index fe2ae4691..24cbc401a 100644 --- a/setup.py +++ b/setup.py 
@@ -1,6 +1,5 @@ #!/usr/bin/env python # -*- coding: iso-8859-1 -*- -# Dosage, the webcomic downloader # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2013 Bastian Kleineidam from __future__ import print_function @@ -394,7 +393,8 @@ class MyRegister (register, object): args = dict( name = AppName, version = AppVersion, - description = 'a commandline webcomic downloader and archiver', + description = 'a commandline comic downloader and archiver', + keywords = 'comic,webcomic,downloader,archiver', author = 'Tristan Seligmann, Jonathan Jacobs, Bastian Kleineidam', author_email = 'bastian.kleineidam@web.de', maintainer = 'Bastian Kleineidam', diff --git a/tests/test_comicnames.py b/tests/test_comicnames.py index 8aea08b16..932729c82 100644 --- a/tests/test_comicnames.py +++ b/tests/test_comicnames.py @@ -8,7 +8,7 @@ class TestComicNames(TestCase): def test_names(self): for scraperclass in scraper.get_scraperclasses(): - name = scraperclass.get_name() + name = scraperclass.getName() self.assertTrue(name.count('/') <= 1, name) if '/' in name: comicname = name.split('/')[1] diff --git a/tests/test_comics.py b/tests/test_comics.py index fbc3bdf28..c250c96b5 100644 --- a/tests/test_comics.py +++ b/tests/test_comics.py @@ -34,7 +34,7 @@ class _ComicTester(TestCase): scraperclass=None def setUp(self): - self.name = self.scraperclass.get_name() + self.name = self.scraperclass.getName() self.url = self.scraperclass.starter() # create a temporary directory for images self.tmpdir = tempfile.mkdtemp() @@ -64,7 +64,7 @@ class _ComicTester(TestCase): def _test_comic(self, scraperobj): num = 0 max_strips = 5 - for strip in islice(scraperobj.getAllStrips(), 0, max_strips): + for strip in scraperobj.getStrips(max_strips): images = [] for image in strip.getImages(): images.append(image.url) @@ -122,7 +122,7 @@ def generate_comic_testers(): g = globals() if "TRAVIS" in os.environ: # Get limited number of scraper tests on Travis builds. 
- max_scrapers = 1500 + max_scrapers = 500 scraperclasses = islice(scraper.get_scraperclasses(), 0, max_scrapers) else: scraperclasses = scraper.get_scraperclasses() diff --git a/tests/test_dosage.py b/tests/test_dosage.py index d4440b085..6b6ceea64 100644 --- a/tests/test_dosage.py +++ b/tests/test_dosage.py @@ -52,6 +52,11 @@ class TestDosage (unittest.TestCase): self.assertRaises(OSError, run_with_options, []) self.assertRaises(OSError, run_with_options, ['--imadoofus']) - def test_fetch(self): + def test_fetch_html(self): run_with_options(["-n", "2", "-b", self.tmpdir, "-o", "html", "calvinandhobbes"]) + + def test_fetch_rss(self): run_with_options(["--numstrips", "2", "--baseurl", "bla", "--basepath", self.tmpdir, "--output", "rss", "--adult", "sexyloser"]) + + def test_fetch_indexed(self): + run_with_options(["-n", "2", "-b", self.tmpdir, "calvinandhobbes:2012/02/02"])