Added some comic strips and cleaned up the scraper code.
This commit is contained in:
parent
6091138481
commit
bae2a96d8b
31 changed files with 296 additions and 128 deletions
|
@ -1,14 +1,14 @@
|
||||||
Dosage
|
Dosage
|
||||||
=======
|
=======
|
||||||
|
|
||||||
Dosage is a commandline webcomic downloader and archiver.
|
Dosage is a commandline comic downloader and archiver.
|
||||||
|
|
||||||
Introduction
|
Introduction
|
||||||
-------------
|
-------------
|
||||||
Dosage is designed to keep a local copy of specific webcomics
|
Dosage is designed to keep a local copy of specific webcomics
|
||||||
and other picture-based content such as Picture of the Day sites.
|
and other picture-based content such as Picture of the Day sites.
|
||||||
With the dosage commandline script you can get the latest strip of
|
With the dosage commandline script you can get the latest strip of
|
||||||
webcomic, or catch-up to the last strip downloaded, or download a
|
a webcomic, or catch-up to the last strip downloaded, or download a
|
||||||
strip for a particular date/index (if the webcomic's site layout
|
strip for a particular date/index (if the webcomic's site layout
|
||||||
makes this possible).
|
makes this possible).
|
||||||
|
|
||||||
|
@ -91,7 +91,7 @@ Technical Description
|
||||||
Dosage is written in Python and relies on regular expressions to
|
Dosage is written in Python and relies on regular expressions to
|
||||||
do most of the grunt work.
|
do most of the grunt work.
|
||||||
|
|
||||||
For each webcomic Dosage has a plugin module, found in the "plugins"
|
For each comic Dosage has a plugin module, found in the "plugins"
|
||||||
subdirectory of the dosagelib directory. Each module is a subclass of
|
subdirectory of the dosagelib directory. Each module is a subclass of
|
||||||
the _BasicComic class and specifies where to download its comic images.
|
the _BasicComic class and specifies where to download its comic images.
|
||||||
Some comic syndicates (GoComics for example) have a standard layout for all
|
Some comic syndicates (GoComics for example) have a standard layout for all
|
||||||
|
@ -100,7 +100,7 @@ instances from a given list of comic strips.
|
||||||
|
|
||||||
Extending Dosage
|
Extending Dosage
|
||||||
-----------------
|
-----------------
|
||||||
In order to add a new webcomic, a new module class has to be created in
|
In order to add a new comic, a new module class has to be created in
|
||||||
one of the *.py files in the dosagelib/plugins subdirectory.
|
one of the *.py files in the dosagelib/plugins subdirectory.
|
||||||
Look at the existing module classes for examples.
|
Look at the existing module classes for examples.
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
Dosage 1.13 (released xx.xx.2013)
|
Dosage 1.13 (released xx.xx.2013)
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- comics: Added comic strips AxeCop, Bearmageddon, DeadWinter,
|
||||||
|
HarkAVagrant, IAmArg, LoadingArtist, Nnewts, PHDComics, PokeyThePenguin,
|
||||||
|
SnowFlame and WorldOfMrToast.
|
||||||
|
|
||||||
Fixes:
|
Fixes:
|
||||||
- cmdline: Catch error when piping output to another
|
- cmdline: Catch error when piping output to another
|
||||||
program or file under Windows.
|
program or file under Windows.
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
.TH DOSAGE 1
|
.TH DOSAGE 1
|
||||||
.SH NAME
|
.SH NAME
|
||||||
dosage \- a commandline webcomic downloader and archiver
|
dosage \- a commandline comic downloader and archiver
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
\fBdosage\fP [\fIoptions\fP] \fImodule\fP...
|
\fBdosage\fP [\fIoptions\fP] \fImodule\fP...
|
||||||
.SH DESCRIPTION
|
.SH DESCRIPTION
|
||||||
|
@ -128,7 +128,7 @@ Retrieve the Penny Arcade strip for a given index:
|
||||||
Retrieve Calvin and Hobbes strips from a given index going backwards to
|
Retrieve Calvin and Hobbes strips from a given index going backwards to
|
||||||
the beginning.
|
the beginning.
|
||||||
.RS
|
.RS
|
||||||
.B dosage \-a calvinandhobbes:20120722
|
.B dosage \-a calvinandhobbes:2012/07/22
|
||||||
.RE
|
.RE
|
||||||
.PP
|
.PP
|
||||||
On Unix, \fBxargs(1)\fP can download several comic strips in parallel,
|
On Unix, \fBxargs(1)\fP can download several comic strips in parallel,
|
||||||
|
|
|
@ -9,7 +9,7 @@ Section: User Commands (1)<BR><A HREF="#index">Index</A>
|
||||||
<A NAME="lbAB"> </A>
|
<A NAME="lbAB"> </A>
|
||||||
<H2>NAME</H2>
|
<H2>NAME</H2>
|
||||||
|
|
||||||
dosage - a commandline webcomic downloader and archiver
|
dosage - a commandline comic downloader and archiver
|
||||||
<A NAME="lbAC"> </A>
|
<A NAME="lbAC"> </A>
|
||||||
<H2>SYNOPSIS</H2>
|
<H2>SYNOPSIS</H2>
|
||||||
|
|
||||||
|
@ -174,7 +174,7 @@ Retrieve the Penny Arcade strip for a given index:
|
||||||
Retrieve Calvin and Hobbes strips from a given index going backwards to
|
Retrieve Calvin and Hobbes strips from a given index going backwards to
|
||||||
the beginning.
|
the beginning.
|
||||||
<DL COMPACT><DT><DD>
|
<DL COMPACT><DT><DD>
|
||||||
<B>dosage -a calvinandhobbes:20120722</B>
|
<B>dosage -a calvinandhobbes:2012/07/22</B>
|
||||||
|
|
||||||
</DL>
|
</DL>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ DOSAGE(1) DOSAGE(1)
|
||||||
|
|
||||||
|
|
||||||
NAME
|
NAME
|
||||||
dosage - a commandline webcomic downloader and archiver
|
dosage - a commandline comic downloader and archiver
|
||||||
|
|
||||||
SYNOPSIS
|
SYNOPSIS
|
||||||
dosage [options] module...
|
dosage [options] module...
|
||||||
|
@ -116,7 +116,7 @@ EXAMPLES
|
||||||
|
|
||||||
Retrieve Calvin and Hobbes strips from a given index going
|
Retrieve Calvin and Hobbes strips from a given index going
|
||||||
backwards to the beginning.
|
backwards to the beginning.
|
||||||
dosage -a calvinandhobbes:20120722
|
dosage -a calvinandhobbes:2012/07/22
|
||||||
|
|
||||||
On Unix, xargs(1) can download several comic strips in paral‐
|
On Unix, xargs(1) can download several comic strips in paral‐
|
||||||
lel, for example using up to 4 processes:
|
lel, for example using up to 4 processes:
|
||||||
|
|
24
dosage
24
dosage
|
@ -1,6 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: iso-8859-1 -*-
|
# -*- coding: iso-8859-1 -*-
|
||||||
# Dosage, the webcomic downloader
|
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
@ -56,7 +55,7 @@ def setupOptions():
|
||||||
@rtype argparse.ArgumentParser
|
@rtype argparse.ArgumentParser
|
||||||
"""
|
"""
|
||||||
kwargs = dict(
|
kwargs = dict(
|
||||||
description = "A commandline webcomic downloader and archiver.",
|
description = "A commandline comic downloader and archiver.",
|
||||||
epilog = Examples,
|
epilog = Examples,
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
)
|
)
|
||||||
|
@ -131,7 +130,7 @@ def displayHelp(comics):
|
||||||
|
|
||||||
def displayComicHelp(scraperobj):
|
def displayComicHelp(scraperobj):
|
||||||
"""Print description and help for a comic."""
|
"""Print description and help for a comic."""
|
||||||
out.context = scraperobj.get_name()
|
out.context = scraperobj.getName()
|
||||||
try:
|
try:
|
||||||
if scraperobj.description:
|
if scraperobj.description:
|
||||||
out.info("Description: " + scraperobj.description)
|
out.info("Description: " + scraperobj.description)
|
||||||
|
@ -163,15 +162,16 @@ def getComics(options):
|
||||||
def getStrips(scraperobj, options):
|
def getStrips(scraperobj, options):
|
||||||
"""Get all strips from a scraper."""
|
"""Get all strips from a scraper."""
|
||||||
errors = 0
|
errors = 0
|
||||||
out.context = scraperobj.get_name()
|
out.context = scraperobj.getName()
|
||||||
if options.all:
|
if options.all:
|
||||||
strips = scraperobj.getAllStrips()
|
numstrips = None
|
||||||
elif options.numstrips:
|
elif options.numstrips:
|
||||||
strips = scraperobj.getAllStrips(options.numstrips)
|
numstrips = options.numstrips
|
||||||
else:
|
else:
|
||||||
strips = scraperobj.getCurrentStrips()
|
# get current strip
|
||||||
|
numstrips = 1
|
||||||
try:
|
try:
|
||||||
for strip in strips:
|
for strip in scraperobj.getStrips(numstrips):
|
||||||
_errors, skipped = saveComicStrip(strip, options.basepath)
|
_errors, skipped = saveComicStrip(strip, options.basepath)
|
||||||
errors += _errors
|
errors += _errors
|
||||||
if skipped and options.cont:
|
if skipped and options.cont:
|
||||||
|
@ -206,7 +206,7 @@ def doList(columnList=True):
|
||||||
"""List available comics."""
|
"""List available comics."""
|
||||||
out.info('Available comic scrapers:')
|
out.info('Available comic scrapers:')
|
||||||
out.info('Comics marked with [A] require age confirmation with the --adult option.')
|
out.info('Comics marked with [A] require age confirmation with the --adult option.')
|
||||||
scrapers = sorted(getScrapers(['@@']), key=lambda s: s.get_name())
|
scrapers = sorted(getScrapers(['@@']), key=lambda s: s.getName())
|
||||||
try:
|
try:
|
||||||
if columnList:
|
if columnList:
|
||||||
num = doColumnList(scrapers)
|
num = doColumnList(scrapers)
|
||||||
|
@ -243,7 +243,7 @@ def doColumnList(scrapers):
|
||||||
def getScraperName(scraperobj, limit=None):
|
def getScraperName(scraperobj, limit=None):
|
||||||
"""Get comic scraper name."""
|
"""Get comic scraper name."""
|
||||||
suffix = " [A]" if scraperobj.adult else ""
|
suffix = " [A]" if scraperobj.adult else ""
|
||||||
name = scraperobj.get_name()
|
name = scraperobj.getName()
|
||||||
if limit is not None:
|
if limit is not None:
|
||||||
name = strlimit(name, limit)
|
name = strlimit(name, limit)
|
||||||
return name + suffix
|
return name + suffix
|
||||||
|
@ -259,7 +259,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
|
||||||
if not adult and scraperclass.adult:
|
if not adult and scraperclass.adult:
|
||||||
warn_adult(scraperclass)
|
warn_adult(scraperclass)
|
||||||
continue
|
continue
|
||||||
dirname = getDirname(scraperclass.get_name())
|
dirname = getDirname(scraperclass.getName())
|
||||||
if os.path.isdir(os.path.join(basepath, dirname)):
|
if os.path.isdir(os.path.join(basepath, dirname)):
|
||||||
yield scraperclass()
|
yield scraperclass()
|
||||||
elif '@@' in comics:
|
elif '@@' in comics:
|
||||||
|
@ -293,7 +293,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
|
||||||
|
|
||||||
def warn_adult(scraperclass):
|
def warn_adult(scraperclass):
|
||||||
"""Print warning about adult content."""
|
"""Print warning about adult content."""
|
||||||
out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.get_name())
|
out.warn("skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName())
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
# -*- coding: iso-8859-1 -*-
|
# -*- coding: iso-8859-1 -*-
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012 Bastian Kleineidam
|
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||||
"""
|
"""
|
||||||
Automated webcomic downloader. Dosage traverses webcomic websites in
|
Automated comic downloader. Dosage traverses comic websites in
|
||||||
order to download each strip of the comic. The intended use is for
|
order to download each strip of the comic. The intended use is for
|
||||||
mirroring the strips locally for ease of viewing; redistribution of the
|
mirroring the strips locally for ease of viewing; redistribution of the
|
||||||
downloaded strips may violate copyright, and is not advisable unless you
|
downloaded strips may violate copyright, and is not advisable unless you
|
||||||
|
@ -11,7 +11,7 @@ your intentions, and received permission to distribute.
|
||||||
|
|
||||||
The primary dosage interface is currently the 'mainline' script, which
|
The primary dosage interface is currently the 'mainline' script, which
|
||||||
is just a thin wrapper that invokes L{dosage.mainline}. Comic modules
|
is just a thin wrapper that invokes L{dosage.mainline}. Comic modules
|
||||||
for each webcomic are located in L{dosage.modules}; most of these make
|
for each comic are located in L{dosage.modules}; most of these make
|
||||||
use of the helper base classes and mixins in L{dosage.modules.helpers},
|
use of the helper base classes and mixins in L{dosage.modules.helpers},
|
||||||
thus making their individual implementations trivial.
|
thus making their individual implementations trivial.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -88,6 +88,7 @@ class ComicImage(object):
|
||||||
|
|
||||||
def save(self, basepath):
|
def save(self, basepath):
|
||||||
"""Save comic URL to filename on disk."""
|
"""Save comic URL to filename on disk."""
|
||||||
|
out.info("Get image URL %s" % self.url, level=1)
|
||||||
self.connect()
|
self.connect()
|
||||||
filename = "%s%s" % (self.filename, self.ext)
|
filename = "%s%s" % (self.filename, self.ext)
|
||||||
comicSize = self.contentLength
|
comicSize = self.contentLength
|
||||||
|
@ -96,6 +97,7 @@ class ComicImage(object):
|
||||||
os.makedirs(comicDir)
|
os.makedirs(comicDir)
|
||||||
|
|
||||||
fn = os.path.join(comicDir, filename)
|
fn = os.path.join(comicDir, filename)
|
||||||
|
# compare with >= since comicSize could be the compressed size
|
||||||
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
|
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
|
||||||
self.touch(fn)
|
self.touch(fn)
|
||||||
out.info('Skipping existing file "%s".' % fn)
|
out.info('Skipping existing file "%s".' % fn)
|
||||||
|
|
|
@ -159,14 +159,17 @@ class AstronomyPOTD(_BasicScraper):
|
||||||
stripUrl = 'http://antwrp.gsfc.nasa.gov/apod/ap%s.html'
|
stripUrl = 'http://antwrp.gsfc.nasa.gov/apod/ap%s.html'
|
||||||
imageSearch = compile(tagre("a", "href", r'(image/\d{4}/[^"]+)'))
|
imageSearch = compile(tagre("a", "href", r'(image/\d{4}/[^"]+)'))
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
noImageUrls = set([
|
|
||||||
'http://antwrp.gsfc.nasa.gov/apod/ap130217.html', # video
|
|
||||||
'http://antwrp.gsfc.nasa.gov/apod/ap130218.html', # video
|
|
||||||
'http://antwrp.gsfc.nasa.gov/apod/ap130226.html', # video
|
|
||||||
])
|
|
||||||
prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "<</a>")
|
prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "<</a>")
|
||||||
help = 'Index format: yymmdd'
|
help = 'Index format: yymmdd'
|
||||||
|
|
||||||
|
def shouldSkipUrl(self, url):
|
||||||
|
"""Skip pages without images."""
|
||||||
|
return url in (
|
||||||
|
'http://antwrp.gsfc.nasa.gov/apod/ap130217.html', # video
|
||||||
|
'http://antwrp.gsfc.nasa.gov/apod/ap130218.html', # video
|
||||||
|
'http://antwrp.gsfc.nasa.gov/apod/ap130226.html', # video
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def namer(cls, imageUrl, pageUrl):
|
def namer(cls, imageUrl, pageUrl):
|
||||||
return '%s-%s' % (pageUrl.split('/')[-1].split('.')[0][2:],
|
return '%s-%s' % (pageUrl.split('/')[-1].split('.')[0][2:],
|
||||||
|
@ -269,3 +272,14 @@ class Annyseed(_BasicScraper):
|
||||||
imageSearch = compile(tagre("img", "src", r'(Annyseed[^"]+)'))
|
imageSearch = compile(tagre("img", "src", r'(Annyseed[^"]+)'))
|
||||||
prevSearch = compile(r'<a href="(http://www\.colourofivy\.com/[^"]+)"><img src="Last.gif"')
|
prevSearch = compile(r'<a href="(http://www\.colourofivy\.com/[^"]+)"><img src="Last.gif"')
|
||||||
help = 'Index format: nnn'
|
help = 'Index format: nnn'
|
||||||
|
|
||||||
|
|
||||||
|
class AxeCop(_BasicScraper):
|
||||||
|
url = 'http://axecop.com/'
|
||||||
|
starter = indirectStarter(url, compile(tagre("a", "href", r'(http://axecop\.com/index\.php/acepisodes/read/episode_\d+/)')))
|
||||||
|
stripUrl = url + 'index.php/acepisodes/read/episode_%s/'
|
||||||
|
firstStripUrl = stripUrl % '0'
|
||||||
|
imageSearch = compile(tagre("img", "src", r'(http://axecop\.com/images/uploads/axecop[^"]+)'))
|
||||||
|
prevSearch = compile(tagre("a", "href", r'(http://axecop\.com/index\.php/acepisodes/read/episode_\d+/)') +
|
||||||
|
tagre("img", "src", r'http://axecop\.com/acimages/buttons/page_left\.png'))
|
||||||
|
help = 'Index format: number'
|
||||||
|
|
|
@ -25,6 +25,15 @@ class Bardsworth(_BasicScraper):
|
||||||
help = 'Index format: nnn'
|
help = 'Index format: nnn'
|
||||||
|
|
||||||
|
|
||||||
|
class Bearmageddon(_BasicScraper):
|
||||||
|
url = 'http://bearmageddon.com/'
|
||||||
|
stripUrl = url + '%s/'
|
||||||
|
firstStripUrl = stripUrl % '2011/08/01/page-1'
|
||||||
|
imageSearch = compile(tagre("img", "src", r'(http://bearmageddon\.com/comics/[^"]+)'))
|
||||||
|
prevSearch = compile(tagre("a", "href", r'(http://bearmageddon\.com/\d+/\d+/\d+/[^"]+)', after='navi-prev'))
|
||||||
|
help = 'Index format: yyyy/mm/dd/stripname'
|
||||||
|
|
||||||
|
|
||||||
class BetterDays(_BasicScraper):
|
class BetterDays(_BasicScraper):
|
||||||
url = 'http://jaynaylor.com/betterdays/'
|
url = 'http://jaynaylor.com/betterdays/'
|
||||||
stripUrl = url + 'archives/%s.html'
|
stripUrl = url + 'archives/%s.html'
|
||||||
|
@ -119,6 +128,16 @@ class BrentalFlossGuest(BrentalFloss):
|
||||||
stripUrl = url + '?id=%s'
|
stripUrl = url + '?id=%s'
|
||||||
|
|
||||||
|
|
||||||
|
# XXX disallowed by robots.txt
|
||||||
|
class _BringBackRoomies(_BasicScraper):
|
||||||
|
url = "http://www.bringbackroomies.com/"
|
||||||
|
stripUrl = url + "comic/%s"
|
||||||
|
imageSearch = compile(tagre("img", "src", r'(http://www\.bringbackroomies\.com/wp-content/uploads/\d+/\d+/[^"]+)'))
|
||||||
|
prevSearch = compile(tagre("span", "class", "mininav-prev") +
|
||||||
|
tagre("a", "href", r'(http://www\.bringbackroomies\.com/comic/[^"]+)'))
|
||||||
|
help = 'Index format: stripname'
|
||||||
|
|
||||||
|
|
||||||
class Brink(_BasicScraper):
|
class Brink(_BasicScraper):
|
||||||
url = 'http://paperfangs.com/brink/'
|
url = 'http://paperfangs.com/brink/'
|
||||||
stripUrl = url + '?p=%s'
|
stripUrl = url + '?p=%s'
|
||||||
|
|
|
@ -209,9 +209,12 @@ class CyanideAndHappiness(_BasicScraper):
|
||||||
stripUrl = url + '%s/'
|
stripUrl = url + '%s/'
|
||||||
imageSearch = compile(tagre("img", "src", r'(http://(?:www\.)?explosm\.net/db/files/[^"]+)', before="a daily webcomic"))
|
imageSearch = compile(tagre("img", "src", r'(http://(?:www\.)?explosm\.net/db/files/[^"]+)', before="a daily webcomic"))
|
||||||
prevSearch = compile(tagre("a", "href", r'(/comics/\d+/)', before="prev"))
|
prevSearch = compile(tagre("a", "href", r'(/comics/\d+/)', before="prev"))
|
||||||
noImageUrls = set(["http://www.explosm.net/comics/3082/"])
|
|
||||||
help = 'Index format: n (unpadded)'
|
help = 'Index format: n (unpadded)'
|
||||||
|
|
||||||
|
def shouldSkipUrl(self, url):
|
||||||
|
"""Skip pages without images."""
|
||||||
|
return url == "http://www.explosm.net/comics/3082/"
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def namer(cls, imageUrl, pageUrl):
|
def namer(cls, imageUrl, pageUrl):
|
||||||
imgname = imageUrl.split('/')[-1]
|
imgname = imageUrl.split('/')[-1]
|
||||||
|
|
|
@ -54,6 +54,15 @@ class DarkWings(_BasicScraper):
|
||||||
help = 'Index format: yyyy/mm/dd/page-nn-mm'
|
help = 'Index format: yyyy/mm/dd/page-nn-mm'
|
||||||
|
|
||||||
|
|
||||||
|
class DeadWinter(_BasicScraper):
|
||||||
|
url = 'http://deadwinter.cc/'
|
||||||
|
stripUrl = url + 'page/%s'
|
||||||
|
firstStripUrl = stripUrl % '1'
|
||||||
|
imageSearch = compile(tagre("img", "src", r"(/static/page/strip/\d+[^']+)", quote="'"))
|
||||||
|
prevSearch = compile(tagre("a", "href", r'(/page/\d+)') + "Previous")
|
||||||
|
help = 'Index format: number'
|
||||||
|
|
||||||
|
|
||||||
class DeathToTheExtremist(_BasicScraper):
|
class DeathToTheExtremist(_BasicScraper):
|
||||||
url = 'http://www.dtecomic.com/'
|
url = 'http://www.dtecomic.com/'
|
||||||
stripUrl = url + '?n=%s'
|
stripUrl = url + '?n=%s'
|
||||||
|
|
|
@ -44,8 +44,8 @@ class FilibusterCartoons(_BasicScraper):
|
||||||
|
|
||||||
class FirstWorldProblems(_BasicScraper):
|
class FirstWorldProblems(_BasicScraper):
|
||||||
url = 'http://bradcolbow.com/archive/C5/'
|
url = 'http://bradcolbow.com/archive/C5/'
|
||||||
stripUrl = url + '%s'
|
stripUrl = url + '%s/'
|
||||||
firstStripUrl = 'http://bradcolbow.com/archive/C5/P10/'
|
firstStripUrl = stripUrl % 'P10'
|
||||||
imageSearch = compile(tagre("img", "src", r'(http://(?:fwpcomics\.s3\.amazonaws\.com|s3\.amazonaws\.com/fwpcomics)/s1-[^"]+)'))
|
imageSearch = compile(tagre("img", "src", r'(http://(?:fwpcomics\.s3\.amazonaws\.com|s3\.amazonaws\.com/fwpcomics)/s1-[^"]+)'))
|
||||||
prevSearch = compile(tagre("a", "href", r'(http://bradcolbow\.com/archive/C5/[^"]+)', before="prev"))
|
prevSearch = compile(tagre("a", "href", r'(http://bradcolbow\.com/archive/C5/[^"]+)', before="prev"))
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
|
@ -126,9 +126,9 @@ class Fallen(_BasicScraper):
|
||||||
part = pageUrl.split('-')[-1].split('.')[0]
|
part = pageUrl.split('-')[-1].split('.')[0]
|
||||||
return '%s-%s' % (part, num)
|
return '%s-%s' % (part, num)
|
||||||
|
|
||||||
def setStrip(self, index):
|
def getIndexStripUrl(self, index):
|
||||||
index, part = index.split('-')
|
index, part = index.split('-')
|
||||||
self.currentUrl = self.stripUrl % (part, index, part)
|
return self.stripUrl % (part, index, part)
|
||||||
|
|
||||||
|
|
||||||
class FredoAndPidjin(_BasicScraper):
|
class FredoAndPidjin(_BasicScraper):
|
||||||
|
|
|
@ -3,6 +3,26 @@
|
||||||
from re import compile
|
from re import compile
|
||||||
from ..scraper import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
|
from ..helpers import bounceStarter
|
||||||
|
|
||||||
|
|
||||||
|
class HarkAVagrant(_BasicScraper):
|
||||||
|
url = 'http://www.harkavagrant.com/'
|
||||||
|
starter = bounceStarter(url,
|
||||||
|
compile(tagre("a", "href", r'(http://www\.harkavagrant\.com/index\.php\?id=\d+)') +
|
||||||
|
tagre("img", "src", "buttonnext.png")))
|
||||||
|
stripUrl = url + 'index.php?id=%s'
|
||||||
|
firstStripUrl = stripUrl % '1'
|
||||||
|
imageSearch = compile(tagre("img", "src", r'(http://www.harkavagrant.com/[^"]+)', after='BORDER'))
|
||||||
|
prevSearch = compile(tagre("a", "href", r'(http://www\.harkavagrant\.com/index\.php\?id=\d+)') +
|
||||||
|
tagre("img", "src", "buttonprevious.png"))
|
||||||
|
help = 'Index format: number'
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def namer(cls, imageUrl, pageUrl):
|
||||||
|
filename = imageUrl.rsplit('/', 1)[1]
|
||||||
|
num = pageUrl.rsplit('=', 1)[1]
|
||||||
|
return '%s-%s' % (num, filename)
|
||||||
|
|
||||||
|
|
||||||
class HijinksEnsue(_BasicScraper):
|
class HijinksEnsue(_BasicScraper):
|
||||||
|
|
|
@ -7,6 +7,15 @@ from ..scraper import _BasicScraper
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
|
|
||||||
|
|
||||||
|
class IAmArg(_BasicScraper):
|
||||||
|
url = 'http://iamarg.com/'
|
||||||
|
stripUrl = url + '%s/'
|
||||||
|
firstStripUrl = stripUrl % '2011/05/08/05082011'
|
||||||
|
imageSearch = compile(tagre("img", "src", r'(http://iamarg\.com/comics/\d+-\d+-\d+[^"]+)'))
|
||||||
|
prevSearch = compile(tagre("a", "href", r'(http://iamarg\.com/\d+/\d+/\d+/[^"]+)', after="prev"))
|
||||||
|
help = 'Index format: yyyy/mm/dd/stripname'
|
||||||
|
|
||||||
|
|
||||||
class IanJay(_BasicScraper):
|
class IanJay(_BasicScraper):
|
||||||
url = 'http://ianjay.net/'
|
url = 'http://ianjay.net/'
|
||||||
stripUrl = url + '?p=%s'
|
stripUrl = url + '?p=%s'
|
||||||
|
|
|
@ -37,8 +37,8 @@ class KevinAndKell(_BasicScraper):
|
||||||
prevSearch = compile(r'<a.+?href="(/?(\.\./)?\d+/kk\d+\.html)"[^>]*><span>Previous Strip', IGNORECASE)
|
prevSearch = compile(r'<a.+?href="(/?(\.\./)?\d+/kk\d+\.html)"[^>]*><span>Previous Strip', IGNORECASE)
|
||||||
help = 'Index format: yyyy-mm-dd'
|
help = 'Index format: yyyy-mm-dd'
|
||||||
|
|
||||||
def setStrip(self, index):
|
def getIndexStripUrl(self, index):
|
||||||
self.currentUrl = self.stripUrl % tuple(map(int, index.split('-')))
|
return self.stripUrl % tuple(map(int, index.split('-')))
|
||||||
|
|
||||||
|
|
||||||
class KhaosKomix(_BasicScraper):
|
class KhaosKomix(_BasicScraper):
|
||||||
|
|
|
@ -24,6 +24,15 @@ class Lint(_BasicScraper):
|
||||||
help = 'Index format: yyyy/mm/dd/num-name'
|
help = 'Index format: yyyy/mm/dd/num-name'
|
||||||
|
|
||||||
|
|
||||||
|
class LoadingArtist(_BasicScraper):
|
||||||
|
url = 'http://www.loadingartist.com/'
|
||||||
|
stripUrl = url + '%s/'
|
||||||
|
firstStripUrl = stripUrl % '2011/01/04/born'
|
||||||
|
imageSearch = compile(tagre("img", "src", r'(http://www\.loadingartist\.com/comics/[^"]+)'))
|
||||||
|
prevSearch = compile(tagre("a", "href", r'(http://www\.loadingartist\.com/\d+/\d+/\d+/[^"]+/)', after="prev"))
|
||||||
|
help = 'Index format: yyyy/mm/dd/stripname'
|
||||||
|
|
||||||
|
|
||||||
class LookingForGroup(_BasicScraper):
|
class LookingForGroup(_BasicScraper):
|
||||||
url = 'http://www.lfgcomic.com/'
|
url = 'http://www.lfgcomic.com/'
|
||||||
stripUrl = url + 'page/%s/'
|
stripUrl = url + 'page/%s/'
|
||||||
|
@ -33,6 +42,7 @@ class LookingForGroup(_BasicScraper):
|
||||||
nameSearch = compile(r'/page/(\d+)/')
|
nameSearch = compile(r'/page/(\d+)/')
|
||||||
help = 'Index format: nnn'
|
help = 'Index format: nnn'
|
||||||
|
|
||||||
|
@classmethod
|
||||||
def namer(self, imageUrl, pageUrl):
|
def namer(self, imageUrl, pageUrl):
|
||||||
return self.nameSearch.search(pageUrl).group(1)
|
return self.nameSearch.search(pageUrl).group(1)
|
||||||
|
|
||||||
|
|
|
@ -69,6 +69,15 @@ class Nicky510(_BasicScraper):
|
||||||
help = 'Index format: stripname'
|
help = 'Index format: stripname'
|
||||||
|
|
||||||
|
|
||||||
|
class Nnewts(_BasicScraper):
|
||||||
|
url = 'http://nnewts.com/'
|
||||||
|
stripUrl = url + '%s/'
|
||||||
|
firstStripUrl = stripUrl % 'nnewts-page-1'
|
||||||
|
imageSearch = compile(tagre("img", "src", r'(http://nnewts\.com/newty/comics/[^"]+)'))
|
||||||
|
prevSearch = compile(tagre("a", "href", r'(http://nnewts\.com/(?:nnewts-)?page-\d+/)', after="navi-prev"))
|
||||||
|
help = 'Index format: page-number'
|
||||||
|
|
||||||
|
|
||||||
class NoNeedForBushido(_BasicScraper):
|
class NoNeedForBushido(_BasicScraper):
|
||||||
url = 'http://noneedforbushido.com/latest/'
|
url = 'http://noneedforbushido.com/latest/'
|
||||||
stripUrl = 'http://noneedforbushido.com/%s/'
|
stripUrl = 'http://noneedforbushido.com/%s/'
|
||||||
|
|
|
@ -83,6 +83,17 @@ class PeppermintSaga(_BasicScraper):
|
||||||
help = 'Index format: number'
|
help = 'Index format: number'
|
||||||
|
|
||||||
|
|
||||||
|
class PHDComics(_BasicScraper):
|
||||||
|
baseurl = 'http://phdcomics.com/'
|
||||||
|
url = baseurl + 'comics.php'
|
||||||
|
stripUrl = baseurl + 'comics/archive.php?comicid=%s'
|
||||||
|
firstStripUrl = stripUrl % '1'
|
||||||
|
imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd[^ ]+)', quote=""))
|
||||||
|
prevSearch = compile(tagre("a", "href", r'((?:comics/)?archive\.php\?comicid=\d+)', quote="") +
|
||||||
|
tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote=""))
|
||||||
|
help = 'Index format: number'
|
||||||
|
|
||||||
|
|
||||||
class PicPakDog(_BasicScraper):
|
class PicPakDog(_BasicScraper):
|
||||||
url = 'http://www.picpak.net/'
|
url = 'http://www.picpak.net/'
|
||||||
stripUrl = url + 'comic/%s/'
|
stripUrl = url + 'comic/%s/'
|
||||||
|
@ -117,6 +128,23 @@ class Pimpette(_BasicScraper):
|
||||||
help = 'Index format: yyyymmdd'
|
help = 'Index format: yyyymmdd'
|
||||||
|
|
||||||
|
|
||||||
|
class PokeyThePenguin(_BasicScraper):
|
||||||
|
baseurl = 'http://www.yellow5.com/pokey/archive/'
|
||||||
|
url = baseurl + 'index558.html'
|
||||||
|
stripUrl = baseurl + 'index%s.html'
|
||||||
|
firstStripUrl = stripUrl % '1'
|
||||||
|
imageSearch = compile(tagre("img", "src", r'(pokey\d+[^"]+)'))
|
||||||
|
multipleImagesPerStrip = True
|
||||||
|
help = 'Index format: number'
|
||||||
|
|
||||||
|
def getPrevUrl(self, url, data, baseUrl):
    """Return the previous strip URL by decreasing the index.html number.

    The previous URL is computed from the indexN.html part of the
    current URL; the page content is not searched.

    @param url: URL of the current strip page
    @param data: content of the current page (unused)
    @param baseUrl: base URL of the current page (unused)
    @return: the URL with the index number decreased by one, or None
      if url contains no indexN.html part
    """
    mo = compile(r"index(\d+)\.html").search(url)
    if mo is None:
        # Not a numbered archive page, so no previous URL can be computed.
        return None
    num = int(mo.group(1)) - 1
    # Keep everything before the last path component and swap the file name.
    prefix = url.rsplit('/', 1)[0]
    return "%s/index%d.html" % (prefix, num)
|
||||||
|
|
||||||
|
|
||||||
class Precocious(_BasicScraper):
|
class Precocious(_BasicScraper):
|
||||||
url = 'http://www.precociouscomic.com/'
|
url = 'http://www.precociouscomic.com/'
|
||||||
starter = indirectStarter(url,
|
starter = indirectStarter(url,
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
from re import compile, MULTILINE, IGNORECASE, sub
|
from re import compile, MULTILINE, IGNORECASE, sub
|
||||||
from os.path import splitext
|
from os.path import splitext
|
||||||
from ..scraper import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
from ..helpers import indirectStarter
|
from ..helpers import indirectStarter, bounceStarter
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
|
|
||||||
|
|
||||||
|
@ -150,6 +150,31 @@ class SluggyFreelance(_BasicScraper):
|
||||||
help = 'Index format: yymmdd'
|
help = 'Index format: yymmdd'
|
||||||
|
|
||||||
|
|
||||||
|
class SnowFlame(_BasicScraper):
|
||||||
|
url = 'http://www.snowflamecomic.com/'
|
||||||
|
stripUrl = url + '?comic=snowflame-%s-%s'
|
||||||
|
firstStripUrl = stripUrl % ('01', '01')
|
||||||
|
imageSearch = compile(tagre("img", "src", r'(http://www\.snowflamecomic\.com/wp-content/uploads/\d+/\d+/[^"]+)'))
|
||||||
|
prevSearch = compile(tagre("span", "class", "mininav-prev") +
|
||||||
|
tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)'))
|
||||||
|
starter = bounceStarter(url,
|
||||||
|
compile(tagre("span", "class", "mininav-next") +
|
||||||
|
tagre("a", "href", r'(http://www\.snowflamecomic\.com/\?comic=snowflame[^"]+)')))
|
||||||
|
help = 'Index format: chapter-page'
|
||||||
|
|
||||||
|
def getIndexStripUrl(self, index):
    """Return the strip URL for an index of the form chapter-page.

    @param index: strip index, for example '01-01'
    @return: self.stripUrl with chapter and page filled in
    """
    # The % operator only fills several placeholders from a tuple; a list
    # counts as one argument and raises TypeError.
    return self.stripUrl % tuple(index.split('-'))


# Backward-compatible alias for the name this method was first added under.
# The other scrapers with a custom index format (Fallen, KevinAndKell)
# define getIndexStripUrl.
getStripIndexUrl = getIndexStripUrl
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def namer(cls, imageUrl, pageUrl):
|
||||||
|
prefix, filename = imageUrl.rsplit('/', 1)
|
||||||
|
ro = compile(r'snowflame-([^-]+)-([^-]+)')
|
||||||
|
mo = ro.search(pageUrl)
|
||||||
|
chapter = mo.group(1)
|
||||||
|
page = mo.group(2)
|
||||||
|
return "%s-%s-%s" % (chapter, page, filename)
|
||||||
|
|
||||||
|
|
||||||
class SodiumEyes(_BasicScraper):
|
class SodiumEyes(_BasicScraper):
|
||||||
url = 'http://sodiumeyes.com/'
|
url = 'http://sodiumeyes.com/'
|
||||||
stripUrl = url + '%s/'
|
stripUrl = url + '%s/'
|
||||||
|
|
|
@ -57,6 +57,40 @@ class Wonderella(_BasicScraper):
|
||||||
help = 'Index format: yyyy/mm/dd/name'
|
help = 'Index format: yyyy/mm/dd/name'
|
||||||
|
|
||||||
|
|
||||||
|
class WorldOfMrToast(_BasicScraper):
    """Scraper for the comic pages hosted on theimaginaryworld.com.

    The pages are walked in the fixed order given by ``prevurls``
    instead of following links found in the page content.
    """
    baseurl = 'http://www.theimaginaryworld.com/'
    url = baseurl + 'mrTcomicA.html'
    stripUrl = baseurl + '%s.html'
    imageSearch = compile(tagre("img", "src", r'(comic[^"]+)'))
    # list the archive links since there is no prev/next navigation
    prevurls = (
        url,
        baseurl + 'mrTcomicW02.html',
        baseurl + 'mrTcomicW01.html',
        baseurl + 'mrGcomic03.html',
        baseurl + 'mrGcomic02.html',
        baseurl + 'mrGcomic01.html',
        baseurl + 'mrTcomicT05.html',
        baseurl + 'mrTcomicT04.html',
        baseurl + 'mrTcomicT03.html',
        baseurl + 'mrTcomicT02.html',
        baseurl + 'mrTcomicT01.html',
        baseurl + 'mrTcomicIW3.html',
        baseurl + 'mrTcomicIW2.html',
        baseurl + 'mrTcomicIW1.html',
    )
    # the last archive entry is where the backwards walk ends
    firstStripUrl = prevurls[-1]
    multipleImagesPerStrip = True
    help = 'Index format: none'

    def getPrevUrl(self, url, data, baseUrl):
        """Return the archive URL that follows ``url`` in ``prevurls``.

        ``data`` and ``baseUrl`` are not used; the page content is not
        consulted.

        @return: the next entry of ``prevurls``, or None when ``url`` is
          the last entry or is not part of the archive list at all
        """
        try:
            # index() raises ValueError for an unknown URL and the
            # subscript raises IndexError past the last entry; both
            # mean there is no previous strip to visit.
            return self.prevurls[self.prevurls.index(url) + 1]
        except (ValueError, IndexError):
            return None
|
||||||
|
|
||||||
|
|
||||||
class WotNow(_BasicScraper):
|
class WotNow(_BasicScraper):
|
||||||
url = 'http://shadowburn.binmode.com/wotnow/'
|
url = 'http://shadowburn.binmode.com/wotnow/'
|
||||||
stripUrl = url + 'comic.php?comic_id=%s'
|
stripUrl = url + 'comic.php?comic_id=%s'
|
||||||
|
|
|
@ -33,9 +33,6 @@ class _BasicScraper(object):
|
||||||
# if more than one image per URL is expected
|
# if more than one image per URL is expected
|
||||||
multipleImagesPerStrip = False
|
multipleImagesPerStrip = False
|
||||||
|
|
||||||
# set of URLs that have no image (eg. only a video link)
|
|
||||||
noImageUrls = set()
|
|
||||||
|
|
||||||
# set to False if previous URLs do not match the strip URL (ie. because of redirects)
|
# set to False if previous URLs do not match the strip URL (ie. because of redirects)
|
||||||
prevUrlMatchesStripUrl = True
|
prevUrlMatchesStripUrl = True
|
||||||
|
|
||||||
|
@ -55,7 +52,7 @@ class _BasicScraper(object):
|
||||||
"""Initialize internal variables."""
|
"""Initialize internal variables."""
|
||||||
self.urls = set()
|
self.urls = set()
|
||||||
if indexes:
|
if indexes:
|
||||||
self.indexes = tuple(indexes)
|
self.indexes = tuple(sorted(indexes))
|
||||||
else:
|
else:
|
||||||
self.indexes = tuple()
|
self.indexes = tuple()
|
||||||
self.skippedUrls = set()
|
self.skippedUrls = set()
|
||||||
|
@ -66,7 +63,7 @@ class _BasicScraper(object):
|
||||||
if not isinstance(other, _BasicScraper):
|
if not isinstance(other, _BasicScraper):
|
||||||
return 1
|
return 1
|
||||||
# first, order by name
|
# first, order by name
|
||||||
d = cmp(self.get_name(), other.get_name())
|
d = cmp(self.getName(), other.getName())
|
||||||
if d != 0:
|
if d != 0:
|
||||||
return d
|
return d
|
||||||
# then by indexes
|
# then by indexes
|
||||||
|
@ -74,65 +71,41 @@ class _BasicScraper(object):
|
||||||
|
|
||||||
def __hash__(self):
|
def __hash__(self):
|
||||||
"""Get hash value from name and index list."""
|
"""Get hash value from name and index list."""
|
||||||
return hash((self.get_name(), self.indexes))
|
return hash((self.getName(), self.indexes))
|
||||||
|
|
||||||
def getCurrentStrips(self):
|
def shouldSkipUrl(self, url):
|
||||||
"""Get current comic strip."""
|
"""Determine if search for images in given URL should be skipped."""
|
||||||
msg = 'Retrieving the current strip'
|
return False
|
||||||
if self.indexes:
|
|
||||||
msg += " for indexes %s" % self.indexes
|
|
||||||
out.info(msg+"...")
|
|
||||||
if self.indexes:
|
|
||||||
for index in self.indexes:
|
|
||||||
url = self.stripUrl % index
|
|
||||||
if url in self.noImageUrls:
|
|
||||||
self.skipUrl(url)
|
|
||||||
else:
|
|
||||||
yield self.getStrip(url)
|
|
||||||
|
|
||||||
else:
|
|
||||||
url = self.getLatestUrl()
|
|
||||||
if url in self.noImageUrls:
|
|
||||||
self.skipUrl(url)
|
|
||||||
else:
|
|
||||||
yield self.getStrip(self.getLatestUrl())
|
|
||||||
|
|
||||||
def skipUrl(self, url):
|
|
||||||
"""Document that an URL had no images."""
|
|
||||||
out.info('Skipping URL %s without image' % url)
|
|
||||||
self.skippedUrls.add(url)
|
|
||||||
|
|
||||||
def getStrip(self, url):
|
|
||||||
"""Get comic strip for given URL."""
|
|
||||||
data, baseUrl = getPageContent(url, self.session)
|
|
||||||
return self.getComicStrip(url, data, baseUrl)
|
|
||||||
|
|
||||||
def getComicStrip(self, url, data, baseUrl):
|
def getComicStrip(self, url, data, baseUrl):
|
||||||
"""Get comic strip downloader for given URL and data."""
|
"""Get comic strip downloader for given URL and data."""
|
||||||
imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
|
imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
|
||||||
imageUrls = set(map(self.imageUrlModifier, imageUrls))
|
imageUrls = set(map(self.imageUrlModifier, imageUrls))
|
||||||
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
||||||
out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
|
out.warn("found %d images instead of 1 at %s with %s" % (len(imageUrls), url, self.imageSearch.pattern))
|
||||||
return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)
|
elif not imageUrls:
|
||||||
|
out.warn("found no images at %s with %s" % (url, self.imageSearch.pattern))
|
||||||
|
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
|
||||||
|
|
||||||
def getAllStrips(self, maxstrips=None):
|
def getStrips(self, maxstrips=None):
|
||||||
"""Get all comic strips."""
|
"""Get comic strips."""
|
||||||
if maxstrips:
|
if maxstrips:
|
||||||
msg = 'Retrieving %d strips' % maxstrips
|
word = "strip" if maxstrips == 1 else "strips"
|
||||||
|
msg = 'Retrieving %d %s' % (maxstrips, word)
|
||||||
else:
|
else:
|
||||||
msg = 'Retrieving all strips'
|
msg = 'Retrieving all strips'
|
||||||
if self.indexes:
|
if self.indexes:
|
||||||
msg += " for indexes %s" % self.indexes
|
if len(self.indexes) == 1:
|
||||||
|
msg += " for index %s" % self.indexes[0]
|
||||||
|
else:
|
||||||
|
msg += " for indexes %s" % self.indexes
|
||||||
|
urls = [self.getIndexStripUrl(index) for index in self.indexes]
|
||||||
|
else:
|
||||||
|
urls = [self.getLatestUrl()]
|
||||||
if self.adult:
|
if self.adult:
|
||||||
msg += " (including adult content)"
|
msg += " (including adult content)"
|
||||||
out.info(msg)
|
out.info(msg)
|
||||||
if self.indexes:
|
for url in urls:
|
||||||
for index in self.indexes:
|
|
||||||
url = self.stripUrl % index
|
|
||||||
for strip in self.getStripsFor(url, maxstrips):
|
|
||||||
yield strip
|
|
||||||
else:
|
|
||||||
url = self.getLatestUrl()
|
|
||||||
for strip in self.getStripsFor(url, maxstrips):
|
for strip in self.getStripsFor(url, maxstrips):
|
||||||
yield strip
|
yield strip
|
||||||
|
|
||||||
|
@ -142,42 +115,49 @@ class _BasicScraper(object):
|
||||||
self.hitFirstStripUrl = False
|
self.hitFirstStripUrl = False
|
||||||
seen_urls = set()
|
seen_urls = set()
|
||||||
while url:
|
while url:
|
||||||
|
out.info('Get strip URL %s' % url, level=1)
|
||||||
data, baseUrl = getPageContent(url, self.session)
|
data, baseUrl = getPageContent(url, self.session)
|
||||||
if url in self.noImageUrls:
|
if self.shouldSkipUrl(url):
|
||||||
self.skipUrl(url)
|
out.info('Skipping URL %s' % url)
|
||||||
|
self.skippedUrls.add(url)
|
||||||
else:
|
else:
|
||||||
yield self.getComicStrip(url, data, baseUrl)
|
yield self.getComicStrip(url, data, baseUrl)
|
||||||
if self.firstStripUrl == url:
|
if self.firstStripUrl == url:
|
||||||
out.debug("Stop at first URL %s" % url)
|
out.debug("Stop at first URL %s" % url)
|
||||||
self.hitFirstStripUrl = True
|
self.hitFirstStripUrl = True
|
||||||
break
|
break
|
||||||
prevUrl = None
|
if maxstrips is not None:
|
||||||
if self.prevSearch:
|
maxstrips -= 1
|
||||||
try:
|
if maxstrips <= 0:
|
||||||
prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
|
break
|
||||||
except ValueError as msg:
|
prevUrl = self.getPrevUrl(url, data, baseUrl)
|
||||||
# assume there is no previous URL, but print a warning
|
|
||||||
out.warn("%s Assuming no previous comic strips exist." % msg)
|
|
||||||
else:
|
|
||||||
prevUrl = self.prevUrlModifier(prevUrl)
|
|
||||||
out.debug("Matched previous URL %s" % prevUrl)
|
|
||||||
seen_urls.add(url)
|
seen_urls.add(url)
|
||||||
if prevUrl in seen_urls:
|
if prevUrl in seen_urls:
|
||||||
# avoid recursive URL loops
|
# avoid recursive URL loops
|
||||||
out.warn("Already seen previous URL %r" % prevUrl)
|
out.warn("Already seen previous URL %r" % prevUrl)
|
||||||
break
|
break
|
||||||
url = prevUrl
|
url = prevUrl
|
||||||
if maxstrips is not None:
|
|
||||||
maxstrips -= 1
|
|
||||||
if maxstrips <= 0:
|
|
||||||
break
|
|
||||||
|
|
||||||
def setStrip(self, index):
|
def getPrevUrl(self, url, data, baseUrl):
|
||||||
"""Set current comic strip URL."""
|
"""Find previous URL."""
|
||||||
self.currentUrl = self.stripUrl % index
|
prevUrl = None
|
||||||
|
if self.prevSearch:
|
||||||
|
try:
|
||||||
|
prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
|
||||||
|
except ValueError as msg:
|
||||||
|
# assume there is no previous URL, but print a warning
|
||||||
|
out.warn("%s Assuming no previous comic strips exist." % msg)
|
||||||
|
else:
|
||||||
|
prevUrl = self.prevUrlModifier(prevUrl)
|
||||||
|
out.debug("Matched previous URL %s" % prevUrl)
|
||||||
|
return prevUrl
|
||||||
|
|
||||||
|
def getIndexStripUrl(self, index):
    """Return the strip page URL for the given index.

    The index is substituted into the ``stripUrl`` template.
    """
    template = self.stripUrl
    return template % index
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_name(cls):
|
def getName(cls):
|
||||||
"""Get scraper name."""
|
"""Get scraper name."""
|
||||||
if hasattr(cls, 'name'):
|
if hasattr(cls, 'name'):
|
||||||
return cls.name
|
return cls.name
|
||||||
|
@ -209,10 +189,6 @@ class _BasicScraper(object):
|
||||||
"""
|
"""
|
||||||
return imageUrl
|
return imageUrl
|
||||||
|
|
||||||
def getFilename(self, imageUrl, pageUrl):
|
|
||||||
"""Return filename for given image and page URL."""
|
|
||||||
return self.namer(imageUrl, pageUrl)
|
|
||||||
|
|
||||||
def getLatestUrl(self):
|
def getLatestUrl(self):
|
||||||
"""Get starter URL from where to scrape comic strips."""
|
"""Get starter URL from where to scrape comic strips."""
|
||||||
return self.starter()
|
return self.starter()
|
||||||
|
@ -227,7 +203,7 @@ def find_scraperclasses(comic, multiple_allowed=False):
|
||||||
candidates = []
|
candidates = []
|
||||||
cname = comic.lower()
|
cname = comic.lower()
|
||||||
for scraperclass in get_scraperclasses():
|
for scraperclass in get_scraperclasses():
|
||||||
lname = scraperclass.get_name().lower()
|
lname = scraperclass.getName().lower()
|
||||||
if lname == cname:
|
if lname == cname:
|
||||||
# perfect match
|
# perfect match
|
||||||
if not multiple_allowed:
|
if not multiple_allowed:
|
||||||
|
@ -237,7 +213,7 @@ def find_scraperclasses(comic, multiple_allowed=False):
|
||||||
elif cname in lname:
|
elif cname in lname:
|
||||||
candidates.append(scraperclass)
|
candidates.append(scraperclass)
|
||||||
if len(candidates) > 1 and not multiple_allowed:
|
if len(candidates) > 1 and not multiple_allowed:
|
||||||
comics = ", ".join(x.get_name() for x in candidates)
|
comics = ", ".join(x.getName() for x in candidates)
|
||||||
raise ValueError('multiple comics found: %s' % comics)
|
raise ValueError('multiple comics found: %s' % comics)
|
||||||
elif not candidates:
|
elif not candidates:
|
||||||
raise ValueError('comic %r not found' % comic)
|
raise ValueError('comic %r not found' % comic)
|
||||||
|
@ -266,10 +242,10 @@ def check_scrapers():
|
||||||
"""Check for duplicate scraper class names."""
|
"""Check for duplicate scraper class names."""
|
||||||
d = {}
|
d = {}
|
||||||
for scraperclass in _scraperclasses:
|
for scraperclass in _scraperclasses:
|
||||||
name = scraperclass.get_name().lower()
|
name = scraperclass.getName().lower()
|
||||||
if name in d:
|
if name in d:
|
||||||
name1 = scraperclass.get_name()
|
name1 = scraperclass.getName()
|
||||||
name2 = d[name].get_name()
|
name2 = d[name].getName()
|
||||||
raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))
|
raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))
|
||||||
d[name] = scraperclass
|
d[name] = scraperclass
|
||||||
|
|
||||||
|
|
|
@ -65,7 +65,7 @@ def has_comic(name):
|
||||||
("SmackJeeves/%s" % name).lower(),
|
("SmackJeeves/%s" % name).lower(),
|
||||||
]
|
]
|
||||||
for scraperclass in get_scraperclasses():
|
for scraperclass in get_scraperclasses():
|
||||||
lname = scraperclass.get_name().lower()
|
lname = scraperclass.getName().lower()
|
||||||
if lname in names:
|
if lname in names:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -275,7 +275,7 @@ def has_comic(name):
|
||||||
("Arcamax/%s" % name).lower(),
|
("Arcamax/%s" % name).lower(),
|
||||||
]
|
]
|
||||||
for scraperclass in get_scraperclasses():
|
for scraperclass in get_scraperclasses():
|
||||||
lname = scraperclass.get_name().lower()
|
lname = scraperclass.getName().lower()
|
||||||
if lname in names:
|
if lname in names:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -83,7 +83,7 @@ def has_creators_comic(name):
|
||||||
"""Test if comic name already exists."""
|
"""Test if comic name already exists."""
|
||||||
cname = "Creators/%s" % name
|
cname = "Creators/%s" % name
|
||||||
for scraperclass in get_scraperclasses():
|
for scraperclass in get_scraperclasses():
|
||||||
lname = scraperclass.get_name().lower()
|
lname = scraperclass.getName().lower()
|
||||||
if lname == cname.lower():
|
if lname == cname.lower():
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -407,7 +407,7 @@ def has_comic(name):
|
||||||
cname = ("Creators/%s" % name).lower()
|
cname = ("Creators/%s" % name).lower()
|
||||||
gname = ("GoComics/%s" % name).lower()
|
gname = ("GoComics/%s" % name).lower()
|
||||||
for scraperclass in get_scraperclasses():
|
for scraperclass in get_scraperclasses():
|
||||||
lname = scraperclass.get_name().lower()
|
lname = scraperclass.getName().lower()
|
||||||
if lname == cname or lname == gname:
|
if lname == cname or lname == gname:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -291,7 +291,7 @@ def has_comic(name):
|
||||||
"""Check if comic name already exists."""
|
"""Check if comic name already exists."""
|
||||||
cname = name.lower()
|
cname = name.lower()
|
||||||
for scraperclass in get_scraperclasses():
|
for scraperclass in get_scraperclasses():
|
||||||
lname = scraperclass.get_name().lower()
|
lname = scraperclass.getName().lower()
|
||||||
if lname == cname:
|
if lname == cname:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
4
setup.py
4
setup.py
|
@ -1,6 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: iso-8859-1 -*-
|
# -*- coding: iso-8859-1 -*-
|
||||||
# Dosage, the webcomic downloader
|
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
@ -394,7 +393,8 @@ class MyRegister (register, object):
|
||||||
args = dict(
|
args = dict(
|
||||||
name = AppName,
|
name = AppName,
|
||||||
version = AppVersion,
|
version = AppVersion,
|
||||||
description = 'a commandline webcomic downloader and archiver',
|
description = 'a commandline comic downloader and archiver',
|
||||||
|
keywords = 'comic,webcomic,downloader,archiver',
|
||||||
author = 'Tristan Seligmann, Jonathan Jacobs, Bastian Kleineidam',
|
author = 'Tristan Seligmann, Jonathan Jacobs, Bastian Kleineidam',
|
||||||
author_email = 'bastian.kleineidam@web.de',
|
author_email = 'bastian.kleineidam@web.de',
|
||||||
maintainer = 'Bastian Kleineidam',
|
maintainer = 'Bastian Kleineidam',
|
||||||
|
|
|
@ -8,7 +8,7 @@ class TestComicNames(TestCase):
|
||||||
|
|
||||||
def test_names(self):
|
def test_names(self):
|
||||||
for scraperclass in scraper.get_scraperclasses():
|
for scraperclass in scraper.get_scraperclasses():
|
||||||
name = scraperclass.get_name()
|
name = scraperclass.getName()
|
||||||
self.assertTrue(name.count('/') <= 1, name)
|
self.assertTrue(name.count('/') <= 1, name)
|
||||||
if '/' in name:
|
if '/' in name:
|
||||||
comicname = name.split('/')[1]
|
comicname = name.split('/')[1]
|
||||||
|
|
|
@ -34,7 +34,7 @@ class _ComicTester(TestCase):
|
||||||
scraperclass=None
|
scraperclass=None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.name = self.scraperclass.get_name()
|
self.name = self.scraperclass.getName()
|
||||||
self.url = self.scraperclass.starter()
|
self.url = self.scraperclass.starter()
|
||||||
# create a temporary directory for images
|
# create a temporary directory for images
|
||||||
self.tmpdir = tempfile.mkdtemp()
|
self.tmpdir = tempfile.mkdtemp()
|
||||||
|
@ -64,7 +64,7 @@ class _ComicTester(TestCase):
|
||||||
def _test_comic(self, scraperobj):
|
def _test_comic(self, scraperobj):
|
||||||
num = 0
|
num = 0
|
||||||
max_strips = 5
|
max_strips = 5
|
||||||
for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
|
for strip in scraperobj.getStrips(max_strips):
|
||||||
images = []
|
images = []
|
||||||
for image in strip.getImages():
|
for image in strip.getImages():
|
||||||
images.append(image.url)
|
images.append(image.url)
|
||||||
|
@ -122,7 +122,7 @@ def generate_comic_testers():
|
||||||
g = globals()
|
g = globals()
|
||||||
if "TRAVIS" in os.environ:
|
if "TRAVIS" in os.environ:
|
||||||
# Get limited number of scraper tests on Travis builds.
|
# Get limited number of scraper tests on Travis builds.
|
||||||
max_scrapers = 1500
|
max_scrapers = 500
|
||||||
scraperclasses = islice(scraper.get_scraperclasses(), 0, max_scrapers)
|
scraperclasses = islice(scraper.get_scraperclasses(), 0, max_scrapers)
|
||||||
else:
|
else:
|
||||||
scraperclasses = scraper.get_scraperclasses()
|
scraperclasses = scraper.get_scraperclasses()
|
||||||
|
|
|
@ -52,6 +52,11 @@ class TestDosage (unittest.TestCase):
|
||||||
self.assertRaises(OSError, run_with_options, [])
|
self.assertRaises(OSError, run_with_options, [])
|
||||||
self.assertRaises(OSError, run_with_options, ['--imadoofus'])
|
self.assertRaises(OSError, run_with_options, ['--imadoofus'])
|
||||||
|
|
||||||
def test_fetch(self):
|
def test_fetch_html(self):
|
||||||
run_with_options(["-n", "2", "-b", self.tmpdir, "-o", "html", "calvinandhobbes"])
|
run_with_options(["-n", "2", "-b", self.tmpdir, "-o", "html", "calvinandhobbes"])
|
||||||
|
|
||||||
|
def test_fetch_rss(self):
|
||||||
run_with_options(["--numstrips", "2", "--baseurl", "bla", "--basepath", self.tmpdir, "--output", "rss", "--adult", "sexyloser"])
|
run_with_options(["--numstrips", "2", "--baseurl", "bla", "--basepath", self.tmpdir, "--output", "rss", "--adult", "sexyloser"])
|
||||||
|
|
||||||
|
def test_fetch_indexed(self):
|
||||||
|
run_with_options(["-n", "2", "-b", self.tmpdir, "calvinandhobbes:2012/02/02"])
|
||||||
|
|
Loading…
Reference in a new issue