Added some comic strips and cleaned up the scraper code.
This commit is contained in:
parent 6091138481
commit bae2a96d8b

31 changed files with 296 additions and 128 deletions
@@ -1,14 +1,14 @@
 Dosage
 =======
 
-Dosage is a commandline webcomic downloader and archiver.
+Dosage is a commandline comic downloader and archiver.
 
 Introduction
 -------------
 Dosage is designed to keep a local copy of specific webcomics
 and other picture-based content such as Picture of the Day sites.
 With the dosage commandline script you can get the latest strip of
-webcomic, or catch-up to the last strip downloaded, or download a
+a webcomic, or catch-up to the last strip downloaded, or download a
 strip for a particular date/index (if the webcomic's site layout
 makes this possible).
 
@@ -91,7 +91,7 @@ Technical Description
 Dosage is written in Python and relies on regular expressions to
 do most of the grunt work.
 
-For each webcomic Dosage has a plugin module, found in the "plugins"
+For each comic Dosage has a plugin module, found in the "plugins"
 subdirectory of the dosagelib directory. Each module is a subclass of
 the _BasicComic class and specifies where to download its comic images.
 Some comic syndicates (GoComics for example) have a standard layout for all
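The regex "grunt work" the README describes comes down to compiling an imageSearch-style pattern and scanning each page's HTML with it. Here is a standalone sketch of that idea; the HTML snippet, URL and pattern are invented for illustration (real modules build their patterns with dosagelib.util.tagre, as the plugin diffs below show):

    import re

    # a page fragment like the ones the plugin modules scan
    html = '<img src="http://example.com/comics/2013-02-28.png" alt="strip">'

    # imageSearch-style pattern: one capturing group holding the image URL
    imageSearch = re.compile(r'<img src="(http://example\.com/comics/[^"]+)"')

    match = imageSearch.search(html)
    if match:
        print(match.group(1))  # http://example.com/comics/2013-02-28.png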
@@ -100,7 +100,7 @@ instances from a given list of comic strips.
 
 Extending Dosage
 -----------------
-In order to add a new webcomic, a new module class has to be created in
+In order to add a new comic, a new module class has to be created in
 one of the *.py files in the dosagelib/plugins subdirectory.
 Look at the existing module classes for examples.
 
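For illustration, such a new module class looks like the plugin classes this commit adds below. The comic name and URLs here are invented, and the relative imports assume the class lives in one of the dosagelib/plugins/*.py files:

    from re import compile
    from ..scraper import _BasicScraper
    from ..util import tagre

    class ExampleComic(_BasicScraper):
        url = 'http://example.com/'
        stripUrl = url + '%s/'
        firstStripUrl = stripUrl % '2011/01/01/first-strip'
        imageSearch = compile(tagre("img", "src", r'(http://example\.com/comics/[^"]+)'))
        prevSearch = compile(tagre("a", "href", r'(http://example\.com/\d+/\d+/\d+/[^"]+)', after="prev"))
        help = 'Index format: yyyy/mm/dd/stripname'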
@@ -1,5 +1,10 @@
 Dosage 1.13 (released xx.xx.2013)
 
+Features:
+- comics: Added comic strips AxeCop, Bearmageddon, DeadWinter,
+  HarkAVagrant, IAmArg, LoadingArtist, Nnewts, PHDComics, PokeyThePenguin,
+  SnowFlame and WorldOfMrToast.
+
 Fixes:
 - cmdline: Catch error when piping output to another
   program or file under Windows.
@@ -1,6 +1,6 @@
 .TH DOSAGE 1
 .SH NAME
-dosage \- a commandline webcomic downloader and archiver
+dosage \- a commandline comic downloader and archiver
 .SH SYNOPSIS
 \fBdosage\fP [\fIoptions\fP] \fImodule\fP...
 .SH DESCRIPTION

@@ -128,7 +128,7 @@ Retrieve the Penny Arcade strip for a given index:
 Retrieve Calvin and Hobbes strips from a given index going backwards to
 the beginning.
 .RS
-.B dosage \-a calvinandhobbes:20120722
+.B dosage \-a calvinandhobbes:2012/07/22
 .RE
 .PP
 On Unix, \fBxargs(1)\fP can download several comic strips in parallel,
@@ -9,7 +9,7 @@ Section: User Commands (1)<BR><A HREF="#index">Index</A>
 <A NAME="lbAB">&nbsp;</A>
 <H2>NAME</H2>
 
-dosage - a commandline webcomic downloader and archiver
+dosage - a commandline comic downloader and archiver
 <A NAME="lbAC">&nbsp;</A>
 <H2>SYNOPSIS</H2>
 

@@ -174,7 +174,7 @@ Retrieve the Penny Arcade strip for a given index:
 Retrieve Calvin and Hobbes strips from a given index going backwards to
 the beginning.
 <DL COMPACT><DT><DD>
-<B>dosage -a calvinandhobbes:20120722</B>
+<B>dosage -a calvinandhobbes:2012/07/22</B>
 
 </DL>
 
@@ -3,7 +3,7 @@ DOSAGE(1)                                                    DOSAGE(1)
 
 
 NAME
-       dosage - a commandline webcomic downloader and archiver
+       dosage - a commandline comic downloader and archiver
 
 SYNOPSIS
        dosage [options] module...

@@ -116,7 +116,7 @@ EXAMPLES
 
        Retrieve Calvin and Hobbes strips from a given index going
        backwards to the beginning.
-              dosage -a calvinandhobbes:20120722
+              dosage -a calvinandhobbes:2012/07/22
 
        On Unix, xargs(1) can download several comic strips in paral‐
        lel, for example using up to 4 processes:
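The xargs command itself falls outside the hunk above. Purely as an illustration, not quoted from the man page, a parallel run in that style could look like:

       echo calvinandhobbes xkcd | xargs -n1 -P4 dosage -b Comics

where -n1 passes one module name per invocation, -P4 runs up to four processes, and -b sets the base path as seen in the test options later in this commit.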
dosage (24 changed lines)
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 # -*- coding: iso-8859-1 -*-
-# Dosage, the webcomic downloader
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2013 Bastian Kleineidam
 from __future__ import print_function

@@ -56,7 +55,7 @@ def setupOptions():
     @rtype argparse.ArgumentParser
     """
     kwargs = dict(
-        description = "A commandline webcomic downloader and archiver.",
+        description = "A commandline comic downloader and archiver.",
         epilog = Examples,
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )

@@ -131,7 +130,7 @@ def displayHelp(comics):
 
 def displayComicHelp(scraperobj):
     """Print description and help for a comic."""
-    out.context = scraperobj.get_name()
+    out.context = scraperobj.getName()
     try:
         if scraperobj.description:
             out.info("Description: " + scraperobj.description)

@@ -163,15 +162,16 @@ def getComics(options):
 def getStrips(scraperobj, options):
     """Get all strips from a scraper."""
     errors = 0
-    out.context = scraperobj.get_name()
+    out.context = scraperobj.getName()
     if options.all:
-        strips = scraperobj.getAllStrips()
+        numstrips = None
     elif options.numstrips:
-        strips = scraperobj.getAllStrips(options.numstrips)
+        numstrips = options.numstrips
     else:
-        strips = scraperobj.getCurrentStrips()
+        # get current strip
+        numstrips = 1
     try:
-        for strip in strips:
+        for strip in scraperobj.getStrips(numstrips):
             _errors, skipped = saveComicStrip(strip, options.basepath)
             errors += _errors
             if skipped and options.cont:

@@ -206,7 +206,7 @@ def doList(columnList=True):
     """List available comics."""
     out.info('Available comic scrapers:')
     out.info('Comics marked with [A] require age confirmation with the --adult option.')
-    scrapers = sorted(getScrapers(['@@']), key=lambda s: s.get_name())
+    scrapers = sorted(getScrapers(['@@']), key=lambda s: s.getName())
     try:
         if columnList:
             num = doColumnList(scrapers)

@@ -243,7 +243,7 @@ def doColumnList(scrapers):
 def getScraperName(scraperobj, limit=None):
     """Get comic scraper name."""
     suffix = " [A]" if scraperobj.adult else ""
-    name = scraperobj.get_name()
+    name = scraperobj.getName()
     if limit is not None:
         name = strlimit(name, limit)
     return name + suffix

@@ -259,7 +259,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
             if not adult and scraperclass.adult:
                 warn_adult(scraperclass)
                 continue
-            dirname = getDirname(scraperclass.get_name())
+            dirname = getDirname(scraperclass.getName())
             if os.path.isdir(os.path.join(basepath, dirname)):
                 yield scraperclass()
     elif '@@' in comics:

@@ -293,7 +293,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
 
 def warn_adult(scraperclass):
     """Print warning about adult content."""
-    out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.get_name())
+    out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName())
 
 
 def main():
@@ -1,8 +1,8 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012 Bastian Kleineidam
+# Copyright (C) 2012-2013 Bastian Kleineidam
 """
-Automated webcomic downloader. Dosage traverses webcomic websites in
+Automated comic downloader. Dosage traverses comic websites in
 order to download each strip of the comic. The intended use is for
 mirroring the strips locally for ease of viewing; redistribution of the
 downloaded strips may violate copyright, and is not advisable unless you

@@ -11,7 +11,7 @@ your intentions, and received permission to distribute.
 
 The primary dosage interface is currently the 'mainline' script, which
 is just a thin wrapper that invokes L{dosage.mainline}. Comic modules
-for each webcomic are located in L{dosage.modules}; most of these make
+for each comic are located in L{dosage.modules}; most of these make
 use of the helper base classes and mixins in L{dosage.modules.helpers},
 thus making their individual implementations trivial.
 """
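The helper classes the docstring mentions appear throughout this commit as the starter functions bounceStarter and indirectStarter. As a sketch of why they make implementations trivial, here is an invented module for a site whose front page only links to the newest strip; the indirectStarter(url, regex) shape mirrors its uses in the plugin diffs below (comic name and URLs are made up):

    from re import compile
    from ..scraper import _BasicScraper
    from ..util import tagre
    from ..helpers import indirectStarter

    class NewestFirst(_BasicScraper):
        # the helper follows the front page's "latest" link to find the
        # real starting strip, so the class itself stays declarative
        url = 'http://example.org/'
        starter = indirectStarter(url,
            compile(tagre("a", "href", r'(http://example\.org/strips/\d+)')))
        stripUrl = url + 'strips/%s'
        imageSearch = compile(tagre("img", "src", r'(http://example\.org/images/[^"]+)'))
        prevSearch = compile(tagre("a", "href", r'(http://example\.org/strips/\d+)', after="prev"))
        help = 'Index format: number'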
@@ -88,6 +88,7 @@ class ComicImage(object):
 
     def save(self, basepath):
         """Save comic URL to filename on disk."""
+        out.info("Get image URL %s" % self.url, level=1)
         self.connect()
         filename = "%s%s" % (self.filename, self.ext)
         comicSize = self.contentLength

@@ -96,6 +97,7 @@ class ComicImage(object):
             os.makedirs(comicDir)
 
         fn = os.path.join(comicDir, filename)
+        # compare with >= since comicSize could be the compressed size
         if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
             self.touch(fn)
             out.info('Skipping existing file "%s".' % fn)
@@ -159,14 +159,17 @@ class AstronomyPOTD(_BasicScraper):
     stripUrl = 'http://antwrp.gsfc.nasa.gov/apod/ap%s.html'
     imageSearch = compile(tagre("a", "href", r'(image/\d{4}/[^"]+)'))
     multipleImagesPerStrip = True
-    noImageUrls = set([
-        'http://antwrp.gsfc.nasa.gov/apod/ap130217.html', # video
-        'http://antwrp.gsfc.nasa.gov/apod/ap130218.html', # video
-        'http://antwrp.gsfc.nasa.gov/apod/ap130226.html', # video
-    ])
     prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "&lt;</a>")
     help = 'Index format: yymmdd'
 
+    def shouldSkipUrl(self, url):
+        """Skip pages without images."""
+        return url in (
+            'http://antwrp.gsfc.nasa.gov/apod/ap130217.html', # video
+            'http://antwrp.gsfc.nasa.gov/apod/ap130218.html', # video
+            'http://antwrp.gsfc.nasa.gov/apod/ap130226.html', # video
+        )
+
     @classmethod
     def namer(cls, imageUrl, pageUrl):
         return '%s-%s' % (pageUrl.split('/')[-1].split('.')[0][2:],

@@ -269,3 +272,14 @@ class Annyseed(_BasicScraper):
     imageSearch = compile(tagre("img", "src", r'(Annyseed[^"]+)'))
     prevSearch = compile(r'<a href="(http://www\.colourofivy\.com/[^"]+)"><img src="Last.gif"')
     help = 'Index format: nnn'
+
+
+class AxeCop(_BasicScraper):
+    url = 'http://axecop.com/'
+    starter = indirectStarter(url, compile(tagre("a", "href", r'(http://axecop\.com/index\.php/acepisodes/read/episode_\d+/)')))
+    stripUrl = url + 'index.php/acepisodes/read/episode_%s/'
+    firstStripUrl = stripUrl % '0'
+    imageSearch = compile(tagre("img", "src", r'(http://axecop\.com/images/uploads/axecop[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://axecop\.com/index\.php/acepisodes/read/episode_\d+/)') +
+                         tagre("img", "src", r'http://axecop\.com/acimages/buttons/page_left\.png'))
+    help = 'Index format: number'
@@ -25,6 +25,15 @@ class Bardsworth(_BasicScraper):
     help = 'Index format: nnn'
 
 
+class Bearmageddon(_BasicScraper):
+    url = 'http://bearmageddon.com/'
+    stripUrl = url + '%s/'
+    firstStripUrl = stripUrl % '2011/08/01/page-1'
+    imageSearch = compile(tagre("img", "src", r'(http://bearmageddon\.com/comics/[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://bearmageddon\.com/\d+/\d+/\d+/[^"]+)', after='navi-prev'))
+    help = 'Index format: yyyy/mm/dd/stripname'
+
+
 class BetterDays(_BasicScraper):
     url = 'http://jaynaylor.com/betterdays/'
     stripUrl = url + 'archives/%s.html'

@@ -119,6 +128,16 @@ class BrentalFlossGuest(BrentalFloss):
     stripUrl = url + '?id=%s'
 
 
+# XXX disallowed by robots.txt
+class _BringBackRoomies(_BasicScraper):
+    url = "http://www.bringbackroomies.com/"
+    stripUrl = url + "comic/%s"
+    imageSearch = compile(tagre("img", "src", r'(http://www\.bringbackroomies\.com/wp-content/uploads/\d+/\d+/[^"]+)'))
+    prevSearch = compile(tagre("span", "class", "mininav-prev") +
+                         tagre("a", "href", r'(http://www\.bringbackroomies\.com/comic/[^"]+)'))
+    help = 'Index format: stripname'
+
+
 class Brink(_BasicScraper):
     url = 'http://paperfangs.com/brink/'
     stripUrl = url + '?p=%s'
@@ -209,9 +209,12 @@ class CyanideAndHappiness(_BasicScraper):
     stripUrl = url + '%s/'
     imageSearch = compile(tagre("img", "src", r'(http://(?:www\.)?explosm\.net/db/files/[^"]+)', before="a daily webcomic"))
     prevSearch = compile(tagre("a", "href", r'(/comics/\d+/)', before="prev"))
-    noImageUrls = set(["http://www.explosm.net/comics/3082/"])
     help = 'Index format: n (unpadded)'
 
+    def shouldSkipUrl(self, url):
+        """Skip pages without images."""
+        return url == "http://www.explosm.net/comics/3082/"
+
     @classmethod
     def namer(cls, imageUrl, pageUrl):
         imgname = imageUrl.split('/')[-1]
@@ -54,6 +54,15 @@ class DarkWings(_BasicScraper):
     help = 'Index format: yyyy/mm/dd/page-nn-mm'
 
 
+class DeadWinter(_BasicScraper):
+    url = 'http://deadwinter.cc/'
+    stripUrl = url + 'page/%s'
+    firstStripUrl = stripUrl % '1'
+    imageSearch = compile(tagre("img", "src", r"(/static/page/strip/\d+[^']+)", quote="'"))
+    prevSearch = compile(tagre("a", "href", r'(/page/\d+)') + "Previous")
+    help = 'Index format: number'
+
+
 class DeathToTheExtremist(_BasicScraper):
     url = 'http://www.dtecomic.com/'
     stripUrl = url + '?n=%s'
@@ -44,8 +44,8 @@ class FilibusterCartoons(_BasicScraper):
 
 class FirstWorldProblems(_BasicScraper):
     url = 'http://bradcolbow.com/archive/C5/'
-    stripUrl = url + '%s'
-    firstStripUrl = 'http://bradcolbow.com/archive/C5/P10/'
+    stripUrl = url + '%s/'
+    firstStripUrl = stripUrl % 'P10'
     imageSearch = compile(tagre("img", "src", r'(http://(?:fwpcomics\.s3\.amazonaws\.com|s3\.amazonaws\.com/fwpcomics)/s1-[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'(http://bradcolbow\.com/archive/C5/[^"]+)', before="prev"))
     multipleImagesPerStrip = True

@@ -126,9 +126,9 @@ class Fallen(_BasicScraper):
         part = pageUrl.split('-')[-1].split('.')[0]
         return '%s-%s' % (part, num)
 
-    def setStrip(self, index):
+    def getIndexStripUrl(self, index):
         index, part = index.split('-')
-        self.currentUrl = self.stripUrl % (part, index, part)
+        return self.stripUrl % (part, index, part)
 
 
 class FredoAndPidjin(_BasicScraper):
@@ -3,6 +3,26 @@
 from re import compile
 from ..scraper import _BasicScraper
 from ..util import tagre
+from ..helpers import bounceStarter
 
 
+class HarkAVagrant(_BasicScraper):
+    url = 'http://www.harkavagrant.com/'
+    starter = bounceStarter(url,
+        compile(tagre("a", "href", r'(http://www\.harkavagrant\.com/index\.php\?id=\d+)') +
+                tagre("img", "src", "buttonnext.png")))
+    stripUrl = url + 'index.php?id=%s'
+    firstStripUrl = stripUrl % '1'
+    imageSearch = compile(tagre("img", "src", r'(http://www.harkavagrant.com/[^"]+)', after='BORDER'))
+    prevSearch = compile(tagre("a", "href", r'(http://www\.harkavagrant\.com/index\.php\?id=\d+)') +
+                         tagre("img", "src", "buttonprevious.png"))
+    help = 'Index format: number'
+
+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        filename = imageUrl.rsplit('/', 1)[1]
+        num = pageUrl.rsplit('=', 1)[1]
+        return '%s-%s' % (num, filename)
+
+
 class HijinksEnsue(_BasicScraper):
@@ -7,6 +7,15 @@ from ..scraper import _BasicScraper
 from ..util import tagre
 
 
+class IAmArg(_BasicScraper):
+    url = 'http://iamarg.com/'
+    stripUrl = url + '%s/'
+    firstStripUrl = stripUrl % '2011/05/08/05082011'
+    imageSearch = compile(tagre("img", "src", r'(http://iamarg\.com/comics/\d+-\d+-\d+[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://iamarg\.com/\d+/\d+/\d+/[^"]+)', after="prev"))
+    help = 'Index format: yyyy/mm/dd/stripname'
+
+
 class IanJay(_BasicScraper):
     url = 'http://ianjay.net/'
     stripUrl = url + '?p=%s'
@@ -37,8 +37,8 @@ class KevinAndKell(_BasicScraper):
     prevSearch = compile(r'<a.+?href="(/?(\.\./)?\d+/kk\d+\.html)"[^>]*><span>Previous Strip', IGNORECASE)
     help = 'Index format: yyyy-mm-dd'
 
-    def setStrip(self, index):
-        self.currentUrl = self.stripUrl % tuple(map(int, index.split('-')))
+    def getIndexStripUrl(self, index):
+        return self.stripUrl % tuple(map(int, index.split('-')))
 
 
 class KhaosKomix(_BasicScraper):
@@ -24,6 +24,15 @@ class Lint(_BasicScraper):
     help = 'Index format: yyyy/mm/dd/num-name'
 
 
+class LoadingArtist(_BasicScraper):
+    url = 'http://www.loadingartist.com/'
+    stripUrl = url + '%s/'
+    firstStripUrl = stripUrl % '2011/01/04/born'
+    imageSearch = compile(tagre("img", "src", r'(http://www\.loadingartist\.com/comics/[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://www\.loadingartist\.com/\d+/\d+/\d+/[^"]+/)', after="prev"))
+    help = 'Index format: yyyy/mm/dd/stripname'
+
+
 class LookingForGroup(_BasicScraper):
     url = 'http://www.lfgcomic.com/'
     stripUrl = url + 'page/%s/'

@@ -33,6 +42,7 @@ class LookingForGroup(_BasicScraper):
     nameSearch = compile(r'/page/(\d+)/')
     help = 'Index format: nnn'
 
+    @classmethod
     def namer(self, imageUrl, pageUrl):
         return self.nameSearch.search(pageUrl).group(1)
 
@@ -69,6 +69,15 @@ class Nicky510(_BasicScraper):
     help = 'Index format: stripname'
 
 
+class Nnewts(_BasicScraper):
+    url = 'http://nnewts.com/'
+    stripUrl = url + '%s/'
+    firstStripUrl = stripUrl % 'nnewts-page-1'
+    imageSearch = compile(tagre("img", "src", r'(http://nnewts\.com/newty/comics/[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://nnewts\.com/(?:nnewts-)?page-\d+/)', after="navi-prev"))
+    help = 'Index format: page-number'
+
+
 class NoNeedForBushido(_BasicScraper):
     url = 'http://noneedforbushido.com/latest/'
     stripUrl = 'http://noneedforbushido.com/%s/'
@@ -83,6 +83,17 @@ class PeppermintSaga(_BasicScraper):
     help = 'Index format: number'
 
 
+class PHDComics(_BasicScraper):
+    baseurl = 'http://phdcomics.com/'
+    url = baseurl + 'comics.php'
+    stripUrl = baseurl + 'comics/archive.php?comicid=%s'
+    firstStripUrl = stripUrl % '1'
+    imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd[^ ]+)', quote=""))
+    prevSearch = compile(tagre("a", "href", r'((?:comics/)?archive\.php\?comicid=\d+)', quote="") +
+                         tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote=""))
+    help = 'Index format: number'
+
+
 class PicPakDog(_BasicScraper):
     url = 'http://www.picpak.net/'
     stripUrl = url + 'comic/%s/'

@@ -117,6 +128,23 @@ class Pimpette(_BasicScraper):
     help = 'Index format: yyyymmdd'
 
 
+class PokeyThePenguin(_BasicScraper):
+    baseurl = 'http://www.yellow5.com/pokey/archive/'
+    url = baseurl + 'index558.html'
+    stripUrl = baseurl + 'index%s.html'
+    firstStripUrl = stripUrl % '1'
+    imageSearch = compile(tagre("img", "src", r'(pokey\d+[^"]+)'))
+    multipleImagesPerStrip = True
+    help = 'Index format: number'
+
+    def getPrevUrl(self, url, data, baseUrl):
+        """Decrease index.html number."""
+        mo = compile(r"index(\d+)\.html").search(url)
+        num = int(mo.group(1)) - 1
+        prefix = url.rsplit('/', 1)[0]
+        return "%s/index%d.html" % (prefix, num)
+
+
 class Precocious(_BasicScraper):
     url = 'http://www.precociouscomic.com/'
     starter = indirectStarter(url,
@@ -5,7 +5,7 @@
 from re import compile, MULTILINE, IGNORECASE, sub
 from os.path import splitext
 from ..scraper import _BasicScraper
-from ..helpers import indirectStarter
+from ..helpers import indirectStarter, bounceStarter
 from ..util import tagre
 
 

@@ -150,6 +150,31 @@ class SluggyFreelance(_BasicScraper):
     help = 'Index format: yymmdd'
 
 
+class SnowFlame(_BasicScraper):
+    url = 'http://www.snowflamecomic.com/'
+    stripUrl = url + '?comic=snowflame-%s-%s'
+    firstStripUrl = stripUrl % ('01', '01')
+    imageSearch = compile(tagre("img", "src", r'(http://www\.snowflamecomic\.com/wp-content/uploads/\d+/\d+/[^"]+)'))
+    prevSearch = compile(tagre("span", "class", "mininav-prev") +
+                         tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)'))
+    starter = bounceStarter(url,
+        compile(tagre("span", "class", "mininav-next") +
+                tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)')))
+    help = 'Index format: chapter-page'
+
+    def getStripIndexUrl(self, index):
+        return self.stripUrl % index.split('-')
+
+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        prefix, filename = imageUrl.rsplit('/', 1)
+        ro = compile(r'snowflame-([^-]+)-([^-]+)')
+        mo = ro.search(pageUrl)
+        chapter = mo.group(1)
+        page = mo.group(2)
+        return "%s-%s-%s" % (chapter, page, filename)
+
+
 class SodiumEyes(_BasicScraper):
     url = 'http://sodiumeyes.com/'
     stripUrl = url + '%s/'
@@ -57,6 +57,40 @@ class Wonderella(_BasicScraper):
     help = 'Index format: yyyy/mm/dd/name'
 
 
+class WorldOfMrToast(_BasicScraper):
+    baseurl = 'http://www.theimaginaryworld.com/'
+    url = baseurl + 'mrTcomicA.html'
+    stripUrl = baseurl + '%s.html'
+    imageSearch = compile(tagre("img", "src", r'(comic[^"]+)'))
+    # list the archive links since there is no prev/next navigation
+    prevurls = (
+        url,
+        baseurl + 'mrTcomicW02.html',
+        baseurl + 'mrTcomicW01.html',
+        baseurl + 'mrGcomic03.html',
+        baseurl + 'mrGcomic02.html',
+        baseurl + 'mrGcomic01.html',
+        baseurl + 'mrTcomicT05.html',
+        baseurl + 'mrTcomicT04.html',
+        baseurl + 'mrTcomicT03.html',
+        baseurl + 'mrTcomicT02.html',
+        baseurl + 'mrTcomicT01.html',
+        baseurl + 'mrTcomicIW3.html',
+        baseurl + 'mrTcomicIW2.html',
+        baseurl + 'mrTcomicIW1.html',
+    )
+    firstStripUrl = prevurls[-1]
+    multipleImagesPerStrip = True
+    help = 'Index format: none'
+
+    def getPrevUrl(self, url, data, baseUrl):
+        idx = self.prevurls.index(url)
+        try:
+            return self.prevurls[idx+1]
+        except IndexError:
+            return None
+
+
 class WotNow(_BasicScraper):
     url = 'http://shadowburn.binmode.com/wotnow/'
     stripUrl = url + 'comic.php?comic_id=%s'
@@ -33,9 +33,6 @@ class _BasicScraper(object):
     # if more than one image per URL is expected
     multipleImagesPerStrip = False
 
-    # set of URLs that have no image (eg. only a video link)
-    noImageUrls = set()
-
     # set to False if previous URLs do not match the strip URL (ie. because of redirects)
     prevUrlMatchesStripUrl = True
 

@@ -55,7 +52,7 @@ class _BasicScraper(object):
         """Initialize internal variables."""
         self.urls = set()
         if indexes:
-            self.indexes = tuple(indexes)
+            self.indexes = tuple(sorted(indexes))
         else:
             self.indexes = tuple()
         self.skippedUrls = set()

@@ -66,7 +63,7 @@ class _BasicScraper(object):
         if not isinstance(other, _BasicScraper):
             return 1
         # first, order by name
-        d = cmp(self.get_name(), other.get_name())
+        d = cmp(self.getName(), other.getName())
         if d != 0:
             return d
         # then by indexes

@@ -74,65 +71,41 @@
 
     def __hash__(self):
         """Get hash value from name and index list."""
-        return hash((self.get_name(), self.indexes))
+        return hash((self.getName(), self.indexes))
 
-    def getCurrentStrips(self):
-        """Get current comic strip."""
-        msg = 'Retrieving the current strip'
-        if self.indexes:
-            msg += " for indexes %s" % self.indexes
-        out.info(msg+"...")
-        if self.indexes:
-            for index in self.indexes:
-                url = self.stripUrl % index
-                if url in self.noImageUrls:
-                    self.skipUrl(url)
-                else:
-                    yield self.getStrip(url)
-
-        else:
-            url = self.getLatestUrl()
-            if url in self.noImageUrls:
-                self.skipUrl(url)
-            else:
-                yield self.getStrip(self.getLatestUrl())
-
-    def skipUrl(self, url):
-        """Document that an URL had no images."""
-        out.info('Skipping URL %s without image' % url)
-        self.skippedUrls.add(url)
-
-    def getStrip(self, url):
-        """Get comic strip for given URL."""
-        data, baseUrl = getPageContent(url, self.session)
-        return self.getComicStrip(url, data, baseUrl)
+    def shouldSkipUrl(self, url):
+        """Determine if search for images in given URL should be skipped."""
+        return False
 
     def getComicStrip(self, url, data, baseUrl):
         """Get comic strip downloader for given URL and data."""
         imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
         imageUrls = set(map(self.imageUrlModifier, imageUrls))
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
-            out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
-        return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)
+            out.warn("found %d images instead of 1 at %s with %s" % (len(imageUrls), url, self.imageSearch.pattern))
+        elif not imageUrls:
+            out.warn("found no images at %s with %s" % (url, self.imageSearch.pattern))
+        return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
 
-    def getAllStrips(self, maxstrips=None):
-        """Get all comic strips."""
+    def getStrips(self, maxstrips=None):
+        """Get comic strips."""
         if maxstrips:
-            msg = 'Retrieving %d strips' % maxstrips
+            word = "strip" if maxstrips == 1 else "strips"
+            msg = 'Retrieving %d %s' % (maxstrips, word)
         else:
             msg = 'Retrieving all strips'
         if self.indexes:
-            msg += " for indexes %s" % self.indexes
+            if len(self.indexes) == 1:
+                msg += " for index %s" % self.indexes[0]
+            else:
+                msg += " for indexes %s" % self.indexes
+            urls = [self.getIndexStripUrl(index) for index in self.indexes]
+        else:
+            urls = [self.getLatestUrl()]
         if self.adult:
             msg += " (including adult content)"
         out.info(msg)
-        if self.indexes:
-            for index in self.indexes:
-                url = self.stripUrl % index
-                for strip in self.getStripsFor(url, maxstrips):
-                    yield strip
-        else:
-            url = self.getLatestUrl()
+        for url in urls:
             for strip in self.getStripsFor(url, maxstrips):
                 yield strip

@@ -142,42 +115,49 @@
         self.hitFirstStripUrl = False
         seen_urls = set()
         while url:
             out.info('Get strip URL %s' % url, level=1)
             data, baseUrl = getPageContent(url, self.session)
-            if url in self.noImageUrls:
-                self.skipUrl(url)
+            if self.shouldSkipUrl(url):
+                out.info('Skipping URL %s' % url)
+                self.skippedUrls.add(url)
             else:
                 yield self.getComicStrip(url, data, baseUrl)
             if self.firstStripUrl == url:
                 out.debug("Stop at first URL %s" % url)
                 self.hitFirstStripUrl = True
                 break
-            prevUrl = None
-            if self.prevSearch:
-                try:
-                    prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
-                except ValueError as msg:
-                    # assume there is no previous URL, but print a warning
-                    out.warn("%s Assuming no previous comic strips exist." % msg)
-                else:
-                    prevUrl = self.prevUrlModifier(prevUrl)
-                    out.debug("Matched previous URL %s" % prevUrl)
-            if maxstrips is not None:
-                maxstrips -= 1
-                if maxstrips <= 0:
-                    break
+            prevUrl = self.getPrevUrl(url, data, baseUrl)
+            seen_urls.add(url)
+            if prevUrl in seen_urls:
+                # avoid recursive URL loops
+                out.warn("Already seen previous URL %r" % prevUrl)
+                break
             url = prevUrl
+            if maxstrips is not None:
+                maxstrips -= 1
+                if maxstrips <= 0:
+                    break
 
-    def setStrip(self, index):
-        """Set current comic strip URL."""
-        self.currentUrl = self.stripUrl % index
+    def getPrevUrl(self, url, data, baseUrl):
+        """Find previous URL."""
+        prevUrl = None
+        if self.prevSearch:
+            try:
+                prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
+            except ValueError as msg:
+                # assume there is no previous URL, but print a warning
+                out.warn("%s Assuming no previous comic strips exist." % msg)
+            else:
+                prevUrl = self.prevUrlModifier(prevUrl)
+                out.debug("Matched previous URL %s" % prevUrl)
+        return prevUrl
+
+    def getIndexStripUrl(self, index):
+        """Get comic strip URL from index."""
+        return self.stripUrl % index
 
     @classmethod
-    def get_name(cls):
+    def getName(cls):
         """Get scraper name."""
         if hasattr(cls, 'name'):
             return cls.name

@@ -209,10 +189,6 @@ class _BasicScraper(object):
         """
         return imageUrl
 
-    def getFilename(self, imageUrl, pageUrl):
-        """Return filename for given image and page URL."""
-        return self.namer(imageUrl, pageUrl)
-
     def getLatestUrl(self):
         """Get starter URL from where to scrape comic strips."""
         return self.starter()

@@ -227,7 +203,7 @@ def find_scraperclasses(comic, multiple_allowed=False):
     candidates = []
     cname = comic.lower()
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname == cname:
             # perfect match
             if not multiple_allowed:

@@ -237,7 +213,7 @@ def find_scraperclasses(comic, multiple_allowed=False):
         elif cname in lname:
             candidates.append(scraperclass)
     if len(candidates) > 1 and not multiple_allowed:
-        comics = ", ".join(x.get_name() for x in candidates)
+        comics = ", ".join(x.getName() for x in candidates)
         raise ValueError('multiple comics found: %s' % comics)
     elif not candidates:
         raise ValueError('comic %r not found' % comic)

@@ -266,10 +242,10 @@ def check_scrapers():
     """Check for duplicate scraper class names."""
     d = {}
     for scraperclass in _scraperclasses:
-        name = scraperclass.get_name().lower()
+        name = scraperclass.getName().lower()
         if name in d:
-            name1 = scraperclass.get_name()
-            name2 = d[name].get_name()
+            name1 = scraperclass.getName()
+            name2 = d[name].getName()
             raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))
         d[name] = scraperclass
 
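Taken together, the scraper.py hunks above replace getCurrentStrips/getAllStrips/setStrip with shouldSkipUrl/getStrips/getIndexStripUrl. A sketch of how a caller drives the new interface, mirroring the getStrips() helper in the dosage script earlier in this commit (the import path and module name are hypothetical; error handling is omitted):

    from dosagelib.plugins.c import CalvinAndHobbes  # hypothetical import path

    scraperobj = CalvinAndHobbes()
    # newest strip plus two predecessors; getStrips(None) would fetch them all
    for strip in scraperobj.getStrips(3):
        for image in strip.getImages():   # strip.getImages() as used in the test diffs below
            image.save("Comics")          # ComicImage.save(basepath), see the comic.py hunks above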
@@ -65,7 +65,7 @@ def has_comic(name):
         ("SmackJeeves/%s" % name).lower(),
     ]
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname in names:
             return True
     return False
@@ -275,7 +275,7 @@ def has_comic(name):
         ("Arcamax/%s" % name).lower(),
     ]
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname in names:
             return True
     return False
@@ -83,7 +83,7 @@ def has_creators_comic(name):
     """Test if comic name already exists."""
     cname = "Creators/%s" % name
    for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname == cname.lower():
             return True
     return False
@@ -407,7 +407,7 @@ def has_comic(name):
     cname = ("Creators/%s" % name).lower()
     gname = ("GoComics/%s" % name).lower()
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname == cname or lname == gname:
             return True
     return False
@@ -291,7 +291,7 @@ def has_comic(name):
     """Check if comic name already exists."""
     cname = name.lower()
     for scraperclass in get_scraperclasses():
-        lname = scraperclass.get_name().lower()
+        lname = scraperclass.getName().lower()
         if lname == cname:
             return True
     return False
setup.py (4 changed lines)
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 # -*- coding: iso-8859-1 -*-
-# Dosage, the webcomic downloader
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2013 Bastian Kleineidam
 from __future__ import print_function

@@ -394,7 +393,8 @@ class MyRegister (register, object):
 args = dict(
     name = AppName,
     version = AppVersion,
-    description = 'a commandline webcomic downloader and archiver',
+    description = 'a commandline comic downloader and archiver',
+    keywords = 'comic,webcomic,downloader,archiver',
     author = 'Tristan Seligmann, Jonathan Jacobs, Bastian Kleineidam',
     author_email = 'bastian.kleineidam@web.de',
     maintainer = 'Bastian Kleineidam',
@@ -8,7 +8,7 @@ class TestComicNames(TestCase):
 
     def test_names(self):
         for scraperclass in scraper.get_scraperclasses():
-            name = scraperclass.get_name()
+            name = scraperclass.getName()
             self.assertTrue(name.count('/') <= 1, name)
             if '/' in name:
                 comicname = name.split('/')[1]
@@ -34,7 +34,7 @@ class _ComicTester(TestCase):
     scraperclass=None
 
     def setUp(self):
-        self.name = self.scraperclass.get_name()
+        self.name = self.scraperclass.getName()
         self.url = self.scraperclass.starter()
         # create a temporary directory for images
         self.tmpdir = tempfile.mkdtemp()

@@ -64,7 +64,7 @@ class _ComicTester(TestCase):
     def _test_comic(self, scraperobj):
         num = 0
         max_strips = 5
-        for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
+        for strip in scraperobj.getStrips(max_strips):
             images = []
             for image in strip.getImages():
                 images.append(image.url)
@@ -122,7 +122,7 @@ def generate_comic_testers():
     g = globals()
     if "TRAVIS" in os.environ:
         # Get limited number of scraper tests on Travis builds.
-        max_scrapers = 1500
+        max_scrapers = 500
         scraperclasses = islice(scraper.get_scraperclasses(), 0, max_scrapers)
     else:
         scraperclasses = scraper.get_scraperclasses()
@@ -52,6 +52,11 @@ class TestDosage (unittest.TestCase):
         self.assertRaises(OSError, run_with_options, [])
         self.assertRaises(OSError, run_with_options, ['--imadoofus'])
 
-    def test_fetch(self):
+    def test_fetch_html(self):
         run_with_options(["-n", "2", "-b", self.tmpdir, "-o", "html", "calvinandhobbes"])
 
+    def test_fetch_rss(self):
+        run_with_options(["--numstrips", "2", "--baseurl", "bla", "--basepath", self.tmpdir, "--output", "rss", "--adult", "sexyloser"])
+
+    def test_fetch_indexed(self):
+        run_with_options(["-n", "2", "-b", self.tmpdir, "calvinandhobbes:2012/02/02"])