Fix comics.

This commit is contained in:
Bastian Kleineidam 2012-12-04 07:02:40 +01:00
parent 45df462a47
commit 387dff79a9
33 changed files with 372 additions and 241 deletions

22
dosage
View file

@ -98,15 +98,19 @@ def getComics(options, comics):
else: else:
strips = scraperobj.getCurrentStrips() strips = scraperobj.getCurrentStrips()
first = True first = True
for strip in strips: try:
_errors, skipped = saveComicStrip(strip, options.basepath) for strip in strips:
errors += _errors _errors, skipped = saveComicStrip(strip, options.basepath)
if not first and scraperobj.indexes: errors += _errors
# stop when indexed retrieval skipped all images for one if not first and scraperobj.indexes:
# comie strip (except the first one) # stop when indexed retrieval skipped all images for one
out.write("Stop retrieval because image file already exists") # comie strip (except the first one)
break out.write("Stop retrieval because image file already exists")
first = False break
first = False
except IOError as msg:
out.write('Error getting strip: %s' % msg)
errors += 1
events.getHandler().end() events.getHandler().end()
return errors return errors

View file

@ -1,10 +1,7 @@
# -*- coding: iso-8859-1 -*- # -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam # Copyright (C) 2012 Bastian Kleineidam
import re
from .util import fetchUrl, getQueryParams from .util import fetchUrl, getQueryParams
from .scraper import _BasicScraper
def queryNamer(paramName, usePageUrl=False): def queryNamer(paramName, usePageUrl=False):
"""Get name from URL query part.""" """Get name from URL query part."""
@ -81,19 +78,3 @@ class IndirectLatestMixin(object):
latestUrl = property(getLatestUrl) latestUrl = property(getLatestUrl)
class _PHPScraper(_BasicScraper):
"""
Scraper for comics using phpComic/CUSP.
This provides an easy way to define scrapers for webcomics using phpComic.
"""
imageUrl = property(lambda self: self.basePath + 'daily.php?date=%s')
imageSearch = property(lambda self: re.compile(r'<img alt=[^>]+ src="(%scomics/\d{6}\..+?)">' % (self.basePath,)))
help = 'Index format: yymmdd'
@classmethod
def starter(cls):
"""Get starter URL."""
return cls.basePath + cls.latestUrl

View file

@ -5,8 +5,8 @@
from re import compile from re import compile
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..helpers import bounceStarter, indirectStarter from ..helpers import bounceStarter
from ..util import tagre, getQueryParams from ..util import tagre
class CaptainSNES(_BasicScraper): class CaptainSNES(_BasicScraper):
@ -144,37 +144,6 @@ class Curvy(_BasicScraper):
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
def cloneManga(name, shortName, lastStrip=None):
url = 'http://manga.clone-army.org'
baseUrl = '%s/%s.php' % (url, shortName)
def namer(self, imageUrl, pageUrl):
return '%03d' % int(getQueryParams(pageUrl)['page'][0])
attrs = dict(
name='CloneManga/' + name,
stripUrl = baseUrl + '?page=%s',
imageSearch=compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (url, shortName), after="center")),
prevSearch=compile(tagre("a", "href", r'([^"]+)')+tagre("img", "src", r"previous\.gif")),
help='Index format: n',
namer=namer,
)
if lastStrip is None:
attrs['starter'] = indirectStarter(baseUrl, compile(tagre("a", "href", r'([^"]+)')+tagre("img", "src", r"last\.gif")))
else:
attrs['latestUrl'] = attrs['stripUrl'] % lastStrip
return type('CloneManga_%s' % name, (_BasicScraper,), attrs)
anm = cloneManga('AprilAndMay', 'anm')
kanami = cloneManga('Kanami', 'kanami')
momoka = cloneManga('MomokaCorner', 'momoka')
nana = cloneManga('NanasEverydayLife', 'nana', '78')
pxi = cloneManga('PaperEleven', 'pxi', '311')
t42r = cloneManga('Tomoyo42sRoom', 't42r')
penny = cloneManga('PennyTribute', 'penny')
class CatAndGirl(_BasicScraper): class CatAndGirl(_BasicScraper):
latestUrl = 'http://catandgirl.com/' latestUrl = 'http://catandgirl.com/'
stripUrl = latestUrl + '?p=%s' stripUrl = latestUrl + '?p=%s'

View file

@ -0,0 +1,61 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from re import compile
from ..scraper import make_scraper
from ..util import tagre, getQueryParams, fetchUrl
# Navigation link patterns shared by all CloneManga scrapers.
# NOTE(review): tagre() is defined in ..util and not visible here; presumably
# it returns a regex source string matching the given tag/attribute, with the
# parenthesized group capturing the attribute value -- confirm.
# Link whose href is captured as the match group.
_linkTag = tagre("a", "href", r'([^"]+)')
# A link immediately followed by the previous/next/last navigation image.
_prevSearch = compile(_linkTag + tagre("img", "src", r"previous\.gif"))
_nextSearch = compile(_linkTag + tagre("img", "src", r"next\.gif"))
_lastSearch = compile(_linkTag + tagre("img", "src", r"last\.gif"))
def add(name, shortName, imageFolder=None, lastStrip=None):
    """Create a CloneManga scraper class and register it in this module.

    The generated class is stored in globals() as 'CloneManga_<name>'.

    @param name: comic name; used for the class name and the scraper name
    @param shortName: basename of the comic's .php page on the site
    @param imageFolder: folder name expected in image URLs; defaults to
        shortName when None
    @param lastStrip: index of the strip to use as the latest one; when
        given, latestUrl is set to that strip's URL and no starter is
        installed
    """
    classname = 'CloneManga_%s' % name
    _url = 'http://manga.clone-army.org'
    baseUrl = '%s/%s.php' % (_url, shortName)
    if imageFolder is None:
        imageFolder = shortName

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Name the image after the zero-padded 'page' query parameter."""
        return '%03d' % int(getQueryParams(pageUrl)['page'][0])

    @classmethod
    def _starter(cls):
        """Get starter URL."""
        # first, try hopping to previous and next comic
        prevUrl = fetchUrl(baseUrl, _prevSearch)
        if not prevUrl:
            # no previous link found, try hopping to last comic
            lastUrl = fetchUrl(baseUrl, _lastSearch)
            if not lastUrl:
                raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, baseUrl))
            return lastUrl
        nextUrl = fetchUrl(prevUrl, _nextSearch)
        if not nextUrl:
            # Report the page that was actually searched; the lookup result
            # is empty at this point and would make the message useless.
            raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, prevUrl))
        return nextUrl

    attrs = dict(
        name='CloneManga/' + name,
        stripUrl=baseUrl + '?page=%s',
        imageSearch=compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (_url, imageFolder), after="center")),
        prevSearch=_prevSearch,
        help='Index format: n',
        namer=namer,
    )
    if lastStrip is None:
        # No fixed last strip: look up the starting URL at run time.
        attrs['starter'] = _starter
    else:
        attrs['latestUrl'] = attrs['stripUrl'] % lastStrip
    globals()[classname] = make_scraper(classname, **attrs)
# Register one scraper class per comic; each call stores a class named
# 'CloneManga_<name>' in this module's globals().
# imageFolder is passed only where the image folder differs from shortName.
# lastStrip pins latestUrl to a fixed strip index instead of using a starter.
add('AprilAndMay', 'anm', imageFolder='AAM')
add('Kanami', 'kanami')
add('MomokaCorner', 'momoka')
add('NanasEverydayLife', 'nana', lastStrip='78')
add('PaperEleven', 'pxi', imageFolder='papereleven', lastStrip='311')
add('Tomoyo42sRoom', 't42r')
add('PennyTribute', 'penny')

View file

@ -6,25 +6,26 @@ from re import compile
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..helpers import indirectStarter from ..helpers import indirectStarter
from ..util import tagre, getQueryParams from ..util import tagre
class DMFA(_BasicScraper): class DMFA(_BasicScraper):
latestUrl = 'http://www.missmab.com/' latestUrl = 'http://www.missmab.com/'
stripUrl = latestUrl + 'Comics/Vol_%s.php' stripUrl = latestUrl + 'Comics/Vol_%s.php'
imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)')) imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"])+')+ multipleImagesPerStrip = True
tagre("img", "src", r'(?:../)?Images/comicprev.gif')) prevSearch = compile(tagre("a", "href", r'((?:Comics/)?Vol[^"]+)')+
tagre("img", "src", r'(?:../)?Images/comicprev\.gif'))
help = 'Index format: nnn (normally, some specials)' help = 'Index format: nnn (normally, some specials)'
class DandyAndCompany(_BasicScraper): class DandyAndCompany(_BasicScraper):
latestUrl = 'http://www.dandyandcompany.com/' latestUrl = 'http://www.dandyandcompany.com/'
stripUrl = latestUrl + '%s' stripUrl = None
imageSearch = compile(tagre("img", "src", r'([^"]*/strips/[^"]+)')) multipleImagesPerStrip = True
prevSearch = compile(r'<a href="(.*)" class="prev"') imageSearch = compile(tagre("a", "href", r'(http://\d+\.bp\.blogspot\.com/[^"]+)', after="imageanchor"))
help = 'Index format: yyyy/mm/dd' prevSearch = compile(tagre("a", "href", r"([^']+)", quote="'", after="Older Posts"))
help = 'Index format: none'
class DarkWings(_BasicScraper): class DarkWings(_BasicScraper):
@ -63,11 +64,11 @@ class DrFun(_BasicScraper):
latestUrl = 'http://www.ibiblio.org/Dave/ar00502.htm' latestUrl = 'http://www.ibiblio.org/Dave/ar00502.htm'
stripUrl = 'http://www.ibiblio.org/Dave/ar%s.htm' stripUrl = 'http://www.ibiblio.org/Dave/ar%s.htm'
imageSearch = compile(r'<A HREF= "(Dr-Fun/df\d{6}/df.+?)">') imageSearch = compile(r'<A HREF= "(Dr-Fun/df\d{6}/df.+?)">')
multipleImagesPerStrip = True
prevSearch = compile(r'<A HREF="(.+?)">Previous Week,') prevSearch = compile(r'<A HREF="(.+?)">Previous Week,')
help = 'Index format: nnnnn' help = 'Index format: nnnnn'
class Dracula(_BasicScraper): class Dracula(_BasicScraper):
latestUrl = 'http://draculacomic.net/' latestUrl = 'http://draculacomic.net/'
stripUrl = latestUrl + 'comic.php?comicID=%s' stripUrl = latestUrl + 'comic.php?comicID=%s'
@ -76,7 +77,6 @@ class Dracula(_BasicScraper):
help = 'Index format: nnn' help = 'Index format: nnn'
class DragonTails(_BasicScraper): class DragonTails(_BasicScraper):
latestUrl = 'http://www.dragon-tails.com/' latestUrl = 'http://www.dragon-tails.com/'
stripUrl = latestUrl + 'archive.php?date=%s' stripUrl = latestUrl + 'archive.php?date=%s'
@ -96,7 +96,7 @@ class DreamKeepersPrelude(_BasicScraper):
class Drowtales(_BasicScraper): class Drowtales(_BasicScraper):
latestUrl = 'http://www.drowtales.com/mainarchive.php' latestUrl = 'http://www.drowtales.com/mainarchive.php'
stripUrl = latestUrl + '?sid=%s' stripUrl = latestUrl + '?sid=%s'
imageSearch = compile(tagre("img", "src", r'("http://www.drowtales.com/mainarchive/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://www\.drowtales\.com/mainarchive/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(\?sid=\d+)', before="link_prev_top")) prevSearch = compile(tagre("a", "href", r'(\?sid=\d+)', before="link_prev_top"))
help = 'Index format: number' help = 'Index format: number'
@ -105,7 +105,8 @@ class DieselSweeties(_BasicScraper):
latestUrl = 'http://www.dieselsweeties.com/' latestUrl = 'http://www.dieselsweeties.com/'
stripUrl = latestUrl + 'archive/%s' stripUrl = latestUrl + 'archive/%s'
imageSearch = compile(tagre("img", "src", r'(/hstrips/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(/hstrips/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/archive/\d+)') + tagre("img", "src", r'http://www\.dieselsweeties\.com/ximages/blackbackarrow160.png')) prevSearch = compile(tagre("a", "href", r'(/archive/\d+)') +
tagre("img", "src", r'(?:http://www\.dieselsweeties\.com/ximages/blackbackarrow160.png|/ximages/prev\.gif)'))
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
@classmethod @classmethod
@ -118,14 +119,10 @@ class DieselSweeties(_BasicScraper):
class DominicDeegan(_BasicScraper): class DominicDeegan(_BasicScraper):
latestUrl = 'http://www.dominic-deegan.com/' latestUrl = 'http://www.dominic-deegan.com/'
stripUrl = latestUrl + 'view.php?date=%s' stripUrl = latestUrl + 'view.php?date=%s'
imageSearch = compile(r'<img src="(.+?save-as=.+?)" alt') imageSearch = compile(tagre("img", "src", r'(comics/\d+\.gif)'))
prevSearch = compile(r'"(view.php\?date=.+?)".+?prev21') prevSearch = compile(r'"(view.php\?date=.+?)".+?prev21')
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
@classmethod
def namer(cls, imageUrl, pageUrl):
return getQueryParams(imageUrl)['save-as'][0].rsplit('.', 1)[0]
class DorkTower(_BasicScraper): class DorkTower(_BasicScraper):
latestUrl = 'http://www.dorktower.com/' latestUrl = 'http://www.dorktower.com/'

View file

@ -35,7 +35,7 @@ def add(name):
return url return url
url = fetchUrl(url, _nextSearch) url = fetchUrl(url, _nextSearch)
if not url: if not url:
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, _url)) raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
return url return url
globals()[classname] = make_scraper(classname, globals()[classname] = make_scraper(classname,

View file

@ -20,16 +20,10 @@ class EerieCuties(_BasicScraper):
class Eriadan(_BasicScraper): class Eriadan(_BasicScraper):
latestUrl = 'http://www.shockdom.com/webcomics/eriadan/' latestUrl = 'http://www.shockdom.com/webcomics/eriadan/'
stripUrl = latestUrl + '%s' stripUrl = latestUrl + '%s'
# XXX fix image search imageSearch = compile(tagre("img", "src", r'(http://www\.shockdom\.com/webcomics/eriadan/files/[^"]+)', after='alt=""'))
imageSearch = compile(r'title="[^"]+?" src="http://www\.shockdom\.com/eriadan/(wp-content/uploads/.+?)"')
prevSearch = compile(tagre("a", "href", r'([^"]+)', after="prev")) prevSearch = compile(tagre("a", "href", r'([^"]+)', after="prev"))
help = 'Index format: yyyy/mm/dd/nnn (unpadded)' help = 'Index format: yyyy/mm/dd/nnn (unpadded)'
@classmethod
def namer(cls, imageUrl, pageUrl):
return '%d' % (int(compile(r'p=(\d+)').search(pageUrl).group(1)))
class ElGoonishShive(_BasicScraper): class ElGoonishShive(_BasicScraper):
name = 'KeenSpot/ElGoonishShive' name = 'KeenSpot/ElGoonishShive'
@ -40,7 +34,6 @@ class ElGoonishShive(_BasicScraper):
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
class ElGoonishShiveNP(_BasicScraper): class ElGoonishShiveNP(_BasicScraper):
name = 'KeenSpot/ElGoonishShiveNP' name = 'KeenSpot/ElGoonishShiveNP'
latestUrl = 'http://www.egscomics.com/egsnp/' latestUrl = 'http://www.egscomics.com/egsnp/'
@ -52,12 +45,10 @@ class ElGoonishShiveNP(_BasicScraper):
class EmergencyExit(_BasicScraper): class EmergencyExit(_BasicScraper):
latestUrl = 'http://www.eecomics.net/' latestUrl = 'http://www.eecomics.net/'
stripUrl = None stripUrl = latestUrl + "?strip_id=%s"
imageSearch = compile(r'"(comics/.+?)"') imageSearch = compile(r'"(comics/.+?)"')
prevSearch = compile(r'START.+?"(.+?)"') prevSearch = compile(tagre("a", "href", r'(\?strip_id=\d+)') + tagre("img", "alt", r"Prior"))
# XXX ? help = 'Index format: n'
help = 'God help us now!'
class ErrantStory(_BasicScraper): class ErrantStory(_BasicScraper):
@ -102,7 +93,7 @@ class EvilInc(_BasicScraper):
class Exiern(_BasicScraper): class Exiern(_BasicScraper):
latestUrl = 'http://www.exiern.com/' latestUrl = 'http://www.exiern.com/'
stripUrl = latestUrl + '?p=%s' stripUrl = latestUrl + '?p=%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.exiern\.com/comics/[^"])')) imageSearch = compile(tagre("img", "src", r'(http://www\.exiern\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.exiern\.com/[^"]+)', after="prev")) prevSearch = compile(tagre("a", "href", r'(http://www\.exiern\.com/[^"]+)', after="prev"))
help = 'Index format: n' help = 'Index format: n'
@ -154,7 +145,6 @@ class ExploitationNow(_BasicScraper):
class Ellerbisms(_BasicScraper): class Ellerbisms(_BasicScraper):
latestUrl = 'http://www.ellerbisms.com/' latestUrl = 'http://www.ellerbisms.com/'
stripUrl = latestUrl + '?p=%s' stripUrl = latestUrl + '?p=%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.ellerbisms\.com/wp-content/uploads/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://www\.ellerbisms\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.ellerbisms\.com/[^"]+)', after="prev")) prevSearch = compile(tagre("a", "href", r'(http://www\.ellerbisms\.com/[^"]+)', after="prev"))
help = 'Index format: nnn' help = 'Index format: nnn'

View file

@ -49,12 +49,12 @@ class FlakyPastry(_BasicScraper):
prevSearch = compile(r'<a href="(.+?)".+?btn_back') prevSearch = compile(r'<a href="(.+?)".+?btn_back')
help = 'Index format: nnnn' help = 'Index format: nnnn'
# XXX move to keenspot
class Flipside(_BasicScraper): class Flipside(_BasicScraper):
latestUrl = 'http://flipside.keenspot.com/comic.php' latestUrl = 'http://flipside.keenspot.com/comic.php'
stripUrl = latestUrl + '?i=%s' stripUrl = latestUrl + '?i=%s'
imageSearch = compile(r'<IMG SRC="(comic/.+?)"') imageSearch = compile(tagre("img", "src", r'(http://cdn\.flipside\.keenspot\.com/comic/[^"]+)'))
prevSearch = compile(r'<A HREF="(comic.php\?i=\d+?)">&lt') prevSearch = compile(tagre("a", "href", r'(http://flipside\.keenspot\.com/comic\.php\?i=\d+)', after="prev"))
help = 'Index format: nnnn' help = 'Index format: nnnn'
@ -114,7 +114,8 @@ class FredoAndPidjin(_BasicScraper):
homepage = 'http://www.pidjin.net/' homepage = 'http://www.pidjin.net/'
stripUrl = None stripUrl = None
help = 'Index format: yyyy/mm/dd/name' help = 'Index format: yyyy/mm/dd/name'
imageSearch = compile(tagre('img', 'src', '(http://cdn\.pidjin\.net/wp-content/uploads/\d\d\d\d/\d\d/\d+[^"]+\.png)')) imageSearch = compile(tagre('img', 'src', '(http://cdn\.pidjin\.net/wp-content/uploads/\d+/\d+/[^"]+\.png)'))
multipleImagesPerStrip = True
prevSearch = compile(tagre('a', 'href', '([^"]+)')+"Prev</a>") prevSearch = compile(tagre('a', 'href', '([^"]+)')+"Prev</a>")
starter = indirectStarter(homepage, starter = indirectStarter(homepage,
compile(tagre('a', 'href', "("+homepage+r'\d\d\d\d/\d\d/\d\d/[^"]+/)'))) compile(tagre('a', 'href', "("+homepage+r'\d\d\d\d/\d\d/\d\d/[^"]+/)')))

View file

@ -12,7 +12,7 @@ _prevSearch = compile(r' <a href="(http://www\.thefallenangel\.co\.uk/.+?)"><img
def add(name, shortname): def add(name, shortname):
latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
classname = asciify(name) classname = "FallenAngel_" + asciify(name)
globals()[classname] = make_scraper(classname, globals()[classname] = make_scraper(classname,
latestUrl = latestUrl, latestUrl = latestUrl,
stripUrl = latestUrl + '?date=%s', stripUrl = latestUrl + '?date=%s',

View file

@ -34,17 +34,9 @@ class GUComics(_BasicScraper):
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
class GenrezvousPoint(_BasicScraper):
latestUrl = 'http://www.genrezvouspoint.com/'
stripUrl = latestUrl + 'index.php?comicID=%s'
imageSearch = compile(r'<img src=\'(comics/.+?)\'')
prevSearch = compile(r' <a[^>]+?href="(.+?)">PREVIOUS</a>')
help = 'Index format: nnn'
class GirlGenius(_BasicScraper): class GirlGenius(_BasicScraper):
latestUrl = 'http://girlgeniusonline.com/comic.php' latestUrl = 'http://girlgeniusonline.com/comic.php'
stripUrl = 'http://www.girlgeniusonline.com/comic.php?date=%s' stripUrl = latestUrl + '?date=%s'
imageSearch = compile(r"(/ggmain/strips/.+?)'") imageSearch = compile(r"(/ggmain/strips/.+?)'")
prevSearch = compile(r"</a> <a href=.+?(/comic.php\?date=.+?)'.+?Previous") prevSearch = compile(r"</a> <a href=.+?(/comic.php\?date=.+?)'.+?Previous")
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
@ -99,7 +91,8 @@ class Gunshow(_BasicScraper):
latestUrl = 'http://gunshowcomic.com/' latestUrl = 'http://gunshowcomic.com/'
stripUrl = latestUrl + '%s' stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://gunshowcomic\.com/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://gunshowcomic\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"]+)') + tagre("img", "src", r'[^"]+menu/small/previous\.gif')) multipleImagesPerStrip = True
prevSearch = compile(tagre("a", "href", r'([^"]+)') + tagre("img", "src", r'[^"]*menu/small/previous\.gif'))
help = 'Index format: n' help = 'Index format: n'
@ -131,7 +124,7 @@ class GlassHalfEmpty(_BasicScraper):
latestUrl = 'http://www.defectivity.com/ghe/index.php' latestUrl = 'http://www.defectivity.com/ghe/index.php'
stripUrl = latestUrl + '?strip_id=%s' stripUrl = latestUrl + '?strip_id=%s'
imageSearch = compile(r'src="(comics/.+?)"') imageSearch = compile(r'src="(comics/.+?)"')
prevSearch = compile(r'</a><a href="(.+?)"><img src="\.\./images/onback\.jpg"') prevSearch = compile(tagre("a", "href", r'(\?strip_id=\d+)') + tagre("img", "src", r'\.\./images/arrowbuttons/onback\.jpg'))
help = 'Index format: nnn' help = 'Index format: nnn'

View file

@ -11,12 +11,3 @@ class HorribleVille(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/d/[^"]+)') + tagre("img", "src", r'/images/previous\.png')) prevSearch = compile(tagre("a", "href", r'(/d/[^"]+)') + tagre("img", "src", r'/images/previous\.png'))
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
class HelpDesk(_BasicScraper):
latestUrl = 'https://www.eviscerati.org/comics?page=78'
stripUrl = 'https://www.eviscerati.org/comics?page=%s'
imageSearch = compile(tagre("img", "src", r'(https://www\.eviscerati\.org/files/comics/[^"]+)'))
prevSearch = compile(tagre("li", "class", r'pager-previous[^"]+') + tagre("a", "href", r'(/comics\?page=%d+)'))
help = 'Index format: n'

View file

@ -9,9 +9,9 @@ from ..util import tagre
class IDreamOfAJeanieBottle(_BasicScraper): class IDreamOfAJeanieBottle(_BasicScraper):
latestUrl = 'http://jeaniebottle.com/' latestUrl = 'http://jeaniebottle.com/'
stripUrl = latestUrl + 'review.php?comicID=' stripUrl = latestUrl + '?p=%s'
imageSearch = compile(r'(/comics/.+?)"') imageSearch = compile(r'(/comics/.+?)"')
prevSearch = compile(r'First".+?(review.php.+?)".+?prev_a.gif') prevSearch = compile(tagre("a", "href", r'(http://jeaniebottle\.com/\?p=\d+)', after="prev"))
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'

View file

@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam # Copyright (C) 2012 Bastian Kleineidam
from re import compile, MULTILINE from re import compile
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..util import tagre from ..util import tagre
@ -21,12 +21,3 @@ class JoeAndMonkey(_BasicScraper):
imageSearch = compile(r'"(/comic/[^"]+)"') imageSearch = compile(r'"(/comic/[^"]+)"')
prevSearch = compile(r"<a href='(/\d+)'>Previous") prevSearch = compile(r"<a href='(/\d+)'>Previous")
help = 'Index format: nnn' help = 'Index format: nnn'
class JoyOfTech(_BasicScraper):
latestUrl = 'http://www.geekculture.com/joyoftech/'
stripUrl = latestUrl + 'joyarchives/%s.html'
imageSearch = compile(tagre("img", "src", r'(joyimages/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(joyarchives/[^"]+)') + r'.+?Previous', MULTILINE)
help = 'Index format: nnn'

View file

@ -4,7 +4,7 @@
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..util import tagre
class Key(_BasicScraper): class Key(_BasicScraper):
latestUrl = 'http://key.shadilyn.com/latestpage.html' latestUrl = 'http://key.shadilyn.com/latestpage.html'
@ -25,7 +25,7 @@ class Krakow(_BasicScraper):
class Kukuburi(_BasicScraper): class Kukuburi(_BasicScraper):
latestUrl = 'http://www.kukuburi.com/current/' latestUrl = 'http://www.kukuburi.com/current/'
stripUrl = 'http://www.kukuburi.com/v2/%s/' stripUrl = 'http://www.kukuburi.com/v2/%s/'
imageSearch = compile(r'img src="(http://www.kukuburi.com/../comics/.+?)"') imageSearch = compile(tagre("img", "src", r'(http://www\.kukuburi\.com/v2/comics/[^"]+)', after='alt="[^"]'))
prevSearch = compile(r'nav-previous.+?"(http.+?)"') prevSearch = compile(r'nav-previous.+?"(http.+?)"')
help = 'Index format: yyyy/mm/dd/stripname' help = 'Index format: yyyy/mm/dd/stripname'

View file

@ -16,16 +16,6 @@ class LasLindas(_BasicScraper):
help = 'Index format: stripname' help = 'Index format: stripname'
class LesbianPiratesFromOuterSpace(_BasicScraper):
latestUrl = 'http://rosalarian.com/lesbianpirates/'
stripUrl = latestUrl + 'index.php?p=%s'
imageSearch = compile(tagre("img", "src", r'("comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/index\.php\?id=\d+)', after="prev"))
help = 'Index format: n'
class Lint(_BasicScraper): class Lint(_BasicScraper):
latestUrl = 'http://www.purnicellin.com/lint/' latestUrl = 'http://www.purnicellin.com/lint/'
stripUrl = latestUrl + '%s' stripUrl = latestUrl + '%s'
@ -34,7 +24,6 @@ class Lint(_BasicScraper):
help = 'Index format: yyyy/mm/dd/num-name' help = 'Index format: yyyy/mm/dd/num-name'
class LookingForGroup(_BasicScraper): class LookingForGroup(_BasicScraper):
latestUrl = 'http://www.lfgcomic.com/page/latest' latestUrl = 'http://www.lfgcomic.com/page/latest'
stripUrl = 'http://www.lfgcomic.com/page/%s' stripUrl = 'http://www.lfgcomic.com/page/%s'
@ -51,8 +40,8 @@ class LookingForGroup(_BasicScraper):
class LittleGamers(_BasicScraper): class LittleGamers(_BasicScraper):
latestUrl = 'http://www.little-gamers.com/' latestUrl = 'http://www.little-gamers.com/'
stripUrl = latestUrl + '%s/' stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(http://www\.little-gamers\.com/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://little-gamers\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www.little-gamers.com/[^"]+)', before="comic-nav-prev-link")) prevSearch = compile(tagre("a", "href", r'(http://www\.little-gamers.com/[^"]+)', before="comic-nav-prev-link"))
help = 'Index format: yyyy/mm/dd/name' help = 'Index format: yyyy/mm/dd/name'

View file

@ -67,8 +67,8 @@ class Melonpool(_BasicScraper):
class Misfile(_BasicScraper): class Misfile(_BasicScraper):
latestUrl = 'http://www.misfile.com/' latestUrl = 'http://www.misfile.com/'
stripUrl = latestUrl + '?date=%s' stripUrl = latestUrl + '?date=%s'
imageSearch = compile(tagre("img", "src", r'(comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r"(comics/[^']+)", quote="'"))
prevSearch = compile(tagre("link", "href", r'([^"]+)', before="Previous")) prevSearch = compile(tagre("link", "href", r"([^']+)", quote="'", before="Previous"))
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
@ -76,15 +76,6 @@ class MysteriesOfTheArcana(_BasicScraper):
latestUrl = 'http://mysteriesofthearcana.com/' latestUrl = 'http://mysteriesofthearcana.com/'
stripUrl = latestUrl + 'index.php?action=comics&cid=%s' stripUrl = latestUrl + 'index.php?action=comics&cid=%s'
imageSearch = compile(tagre("img", "src", r'(image\.php\?type=com&i=[^"]+)')) imageSearch = compile(tagre("img", "src", r'(image\.php\?type=com&i=[^"]+)'))
prevSearch = compile(tagre("a", "href", r'()', after="navprevius")) prevSearch = compile(tagre("a", "href", r'(index\.php[^"]+)', after="navprevious"))
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
# XXX move to keenspot?
class MysticRevolution(_BasicScraper):
latestUrl = 'http://mysticrevolution.keenspot.com/'
stripUrl = latestUrl + '?cid=%s'
imageSearch = compile(tagre("img", "src", r'(http://cdn\.mysticrevolution\.keenspot\.com/comics/[^"]+)'))
prevSearch = compile(tagre("link", "rel", r'(\?cid=\d+)', before="prev"))
help = 'Index format: n (unpadded)'

View file

@ -4,14 +4,14 @@
from re import compile from re import compile
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..helpers import indirectStarter, _PHPScraper from ..helpers import indirectStarter, bounceStarter
from ..util import tagre from ..util import tagre
class NamirDeiter(_BasicScraper): class NamirDeiter(_BasicScraper):
latestUrl = 'http://www.namirdeiter.com/' latestUrl = 'http://www.namirdeiter.com/'
stripUrl = latestUrl + 'comics/index.php?date=%s' stripUrl = latestUrl + 'comics/index.php?date=%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.namirdeiter\.com/comics/\d\.jpg)', quote="")) imageSearch = compile(tagre("img", "src", r"'?(http://www\.namirdeiter\.com/comics/\d+\.jpg)'?", quote=""))
prevSearch = compile(tagre("a", "href", r'(http://www\.namirdeiter\.com/comics/index\.php\?date=\d+)', quote="'")+"Previous") prevSearch = compile(tagre("a", "href", r'(http://www\.namirdeiter\.com/comics/index\.php\?date=\d+)', quote="'")+"Previous")
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
@ -63,17 +63,19 @@ class Nukees(_BasicScraper):
help = 'Index format: yyyymmdd.html' help = 'Index format: yyyymmdd.html'
class NekoTheKitty(_BasicScraper):
class NekoTheKitty(_PHPScraper): basePath = 'http://www.nekothekitty.net/'
basePath = 'http://www.nekothekitty.net/cusp/' stripUrl = basePath + 'comics/%s'
latestUrl = basePath starter = bounceStarter(basePath, compile(tagre("a", "href", r'(http://www\.nekothekitty\.net/comics/[^"]+)') +
prevSearch = compile(tagre("a", "href", r'(http://www.nekothekitty.net/comics/[^"]+)') + tagre("img", "src", r'http://www\.nekothekitty\.net/files/smallnext.png')))
imageSearch = compile(tagre("img", "src", r'(http://(?:img\d+|www)\.smackjeeves\.com/images/uploaded/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.nekothekitty\.net/comics/[^"]+)') +
tagre("img", "src", r'http://www\.nekothekitty\.net/files/smallprev.png')) tagre("img", "src", r'http://www\.nekothekitty\.net/files/smallprev.png'))
help = 'Index format: n/n-name'
class NichtLustig(_BasicScraper): class NichtLustig(_BasicScraper):
stripUrl = 'http://www.nichtlustig.de/toondb/%s.html' stripUrl = 'http://static.nichtlustig.de/toondb/%s.html'
imageSearch = compile('background-image:url\((http://static\.nichtlustig\.de/comics/full/\d+\.jpg)') imageSearch = compile('background-image:url\((http://static\.nichtlustig\.de/comics/full/\d+\.jpg)')
prevSearch = compile(tagre("a", "href", r'(http://static\.nichtlustig\.de/toondb/\d+\.html)')) prevSearch = compile(tagre("a", "href", r'(http://static\.nichtlustig\.de/toondb/\d+\.html)'))
help = 'Index format: yymmdd' help = 'Index format: yymmdd'
@ -101,6 +103,7 @@ class NekkoAndJoruba(_BasicScraper):
class NobodyScores(_BasicScraper): class NobodyScores(_BasicScraper):
latestUrl = 'http://nobodyscores.loosenutstudio.com/' latestUrl = 'http://nobodyscores.loosenutstudio.com/'
stripUrl = latestUrl + 'index.php?id=%s' stripUrl = latestUrl + 'index.php?id=%s'
imageSearch = compile(r'><img src="(http://nobodyscores\.loosenutstudio\.com/comix/.+?)"') imageSearch = compile(tagre("img", "src", r'(http://nobodyscores\.loosenutstudio\.com/comix/[^"]+)'))
multipleImagesPerStrip = True
prevSearch = compile(r'<a href="(http://nobodyscores\.loosenutstudio\.com/index.php.+?)">the one before </a>') prevSearch = compile(r'<a href="(http://nobodyscores\.loosenutstudio\.com/index.php.+?)">the one before </a>')
help = 'Index format: nnn' help = 'Index format: nnn'

View file

@ -28,7 +28,7 @@ class OddFish(_BasicScraper):
class OnTheEdge(_BasicScraper): class OnTheEdge(_BasicScraper):
latestUrl = 'http://ontheedgecomics.com/' latestUrl = 'http://ontheedgecomics.com/'
stripUrl = 'http://ontheedgecomics.com/comic/ote%s' stripUrl = 'http://ontheedgecomics.com/comic/%s'
imageSearch = compile(r'<img src="(http://ontheedgecomics.com/comics/.+?)"') imageSearch = compile(r'<img src="(http://ontheedgecomics.com/comics/.+?)"')
prevSearch = compile(r'<a href="([^"]+)" rel="prev">') prevSearch = compile(r'<a href="([^"]+)" rel="prev">')
help = 'Index format: nnn (unpadded)' help = 'Index format: nnn (unpadded)'

View file

@ -12,11 +12,10 @@ class PartiallyClips(_BasicScraper):
latestUrl = 'http://partiallyclips.com/' latestUrl = 'http://partiallyclips.com/'
stripUrl = latestUrl + '%s/' stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(http://partiallyclips\.com/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://partiallyclips\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://partiallyclips\.com/[^"]+)', before="prev")) prevSearch = compile(tagre("a", "href", r'(http://partiallyclips\.com/[^"]+)', after="prev"))
help = 'Index format: yyyy/mm/dd/stripname' help = 'Index format: yyyy/mm/dd/stripname'
class PastelDefender(_BasicScraper): class PastelDefender(_BasicScraper):
latestUrl = 'http://www.pasteldefender.com/coverbackcover.html' latestUrl = 'http://www.pasteldefender.com/coverbackcover.html'
stripUrl = 'http://www.pasteldefender.com/%s.html' stripUrl = 'http://www.pasteldefender.com/%s.html'
@ -25,7 +24,6 @@ class PastelDefender(_BasicScraper):
help = 'Index format: nnn' help = 'Index format: nnn'
class PebbleVersion(_BasicScraper): class PebbleVersion(_BasicScraper):
latestUrl = 'http://www.pebbleversion.com/' latestUrl = 'http://www.pebbleversion.com/'
stripUrl = latestUrl + 'Archives/Strip%s.html' stripUrl = latestUrl + 'Archives/Strip%s.html'
@ -37,7 +35,7 @@ class PebbleVersion(_BasicScraper):
class PennyAndAggie(_BasicScraper): class PennyAndAggie(_BasicScraper):
baseUrl = 'http://www.pennyandaggie.com/' baseUrl = 'http://www.pennyandaggie.com/'
stripUrl = baseUrl + 'index.php?p=%s' stripUrl = baseUrl + 'index.php?p=%s'
imageSearch = compile(tagre("a", "href", r'(http://www\.pennyandaggie\.com/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://www\.pennyandaggie\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r"(index\.php\?p=\d+)", quote="'") + prevSearch = compile(tagre("a", "href", r"(index\.php\?p=\d+)", quote="'") +
tagre("img", "src", r'http://pennyandaggie\.com/images/previous_day\.gif', quote="")) tagre("img", "src", r'http://pennyandaggie\.com/images/previous_day\.gif', quote=""))
starter = indirectStarter(baseUrl, prevSearch) starter = indirectStarter(baseUrl, prevSearch)
@ -47,20 +45,19 @@ class PennyAndAggie(_BasicScraper):
class PennyArcade(_BasicScraper): class PennyArcade(_BasicScraper):
baseUrl = 'http://penny-arcade.com/comic/' baseUrl = 'http://penny-arcade.com/comic/'
starter = bounceStarter(baseUrl, starter = bounceStarter(baseUrl,
compile(tagre("a", "href", r'(http://penny-arcade\.com/comic/[^"]+)', before="bntNext")) compile(tagre("a", "href", r'(http://penny-arcade\.com/comic/[^"]+)', before="btnNext"))
) )
stripUrl = baseUrl + '%s/' stripUrl = baseUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://art\.penny-arcade\.com/photos/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://art\.penny-arcade\.com/photos/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://penny-arcade\.com/comic/[^"]+)', before="bntPrev")) prevSearch = compile(tagre("a", "href", r'(http://penny-arcade\.com/comic/[^"]+)', before="btnPrev"))
help = 'Index format: yyyy/mm/dd' help = 'Index format: yyyy/mm/dd'
@classmethod @classmethod
def namer(cls, imageUrl, pageUrl): def namer(cls, imageUrl, pageUrl):
yyyy, mm, dd = pageUrl.split('/')[-4:-1] dummy, yyyy, mm, dd = pageUrl.rsplit('/', 3)
return '%04d%02d%02d' % (int(yyyy), int(mm), int(dd)) return '%04d%02d%02d' % (int(yyyy), int(mm), int(dd))
class PeppermintSaga(_BasicScraper): class PeppermintSaga(_BasicScraper):
latestUrl = 'http://www.pepsaga.com/' latestUrl = 'http://www.pepsaga.com/'
stripUrl = latestUrl + '?p=%s' stripUrl = latestUrl + '?p=%s'
@ -101,7 +98,7 @@ class Precocious(_BasicScraper):
class PvPonline(_BasicScraper): class PvPonline(_BasicScraper):
latestUrl = 'http://pvponline.com/comic' latestUrl = 'http://pvponline.com/comic'
stripUrl = latestUrl + '%s' stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://newcdn\.pvponline\.com/img/comic/pvp\d+\.jpg)')) imageSearch = compile(tagre("img", "src", r'(http://newcdn\.pvponline\.com/img/comic/pvp[^"]+\.jpg)'))
prevSearch = compile(tagre("a", "href", r'(http://pvponline\.com/comic/[^"]+)', after="Previous")) prevSearch = compile(tagre("a", "href", r'(http://pvponline\.com/comic/[^"]+)', after="Previous"))
help = 'Index format: yyyy/mm/dd/stripname' help = 'Index format: yyyy/mm/dd/stripname'
@ -135,7 +132,7 @@ evilish = pensAndTales('Evilish', 'http://evilish.pensandtales.com/')
class ProperBarn(_BasicScraper): class ProperBarn(_BasicScraper):
latestUrl = 'http://www.nitrocosm.com/go/gag/' latestUrl = 'http://www.nitrocosm.com/go/gag/'
stripUrl = latestUrl + '%s/' stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(http://content\.nitrocosm\.com/gag/\d+.png)')) imageSearch = compile(tagre("img", "src", r'(http://content\.nitrocosm\.com/gag/\d+\.[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.nitrocosm\.com/go/gag/\d+/)', after="nav_btn_previous")) prevSearch = compile(tagre("a", "href", r'(http://www\.nitrocosm\.com/go/gag/\d+/)', after="nav_btn_previous"))
help = 'Index format: nnn' help = 'Index format: nnn'

View file

@ -19,7 +19,7 @@ class RadioactivePanda(_BasicScraper):
# XXX add other comics at http://petitesymphony.com/comics/ # XXX add other comics at http://petitesymphony.com/comics/
class Rascals(_BasicScraper): class Rascals(_BasicScraper):
latestUrl = 'http://rascals.petitesymphony.com/' latestUrl = 'http://rascals.petitesymphony.com/'
stripUrl = latestUrl + '/comic/rascals-pg-%s/' stripUrl = latestUrl + 'comic/rascals-pg-%s/'
imageSearch = compile(tagre("img", "src", r'(http://rascals\.petitesymphony\.com/files/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://rascals\.petitesymphony\.com/files/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://rascals\.petitesymphony\.com/comic/[^"]+)', after="Previous")) prevSearch = compile(tagre("a", "href", r'(http://rascals\.petitesymphony\.com/comic/[^"]+)', after="Previous"))
help = 'Index format: num' help = 'Index format: num'
@ -36,7 +36,7 @@ class RealLife(_BasicScraper):
class RedString(_BasicScraper): class RedString(_BasicScraper):
latestUrl = 'http://www.redstring.strawberrycomics.com/' latestUrl = 'http://www.redstring.strawberrycomics.com/'
stripUrl = latestUrl + 'index.php?id=%s' stripUrl = latestUrl + 'index.php?id=%s'
imageSearch = compile(tagre("img", "src", r'("comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/index\.php\?id=\d+)', after="prev")) prevSearch = compile(tagre("a", "href", r'(/index\.php\?id=\d+)', after="prev"))
help = 'Index format: nnn' help = 'Index format: nnn'

View file

@ -10,7 +10,7 @@ from ..util import tagre
class SailorsunOrg(_BasicScraper): class SailorsunOrg(_BasicScraper):
latestUrl = 'http://www.sailorsun.org/' latestUrl = 'http://sailorsun.org/'
stripUrl = latestUrl + '?p=%s' stripUrl = latestUrl + '?p=%s'
imageSearch = compile(tagre("img", "src", r'(http://sailorsun\.org/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://sailorsun\.org/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://sailorsun\.org/\?p=\d+)', after="prev")) prevSearch = compile(tagre("a", "href", r'(http://sailorsun\.org/\?p=\d+)', after="prev"))
@ -28,7 +28,7 @@ class SamAndFuzzy(_BasicScraper):
class SarahZero(_BasicScraper): class SarahZero(_BasicScraper):
latestUrl = 'http://www.sarahzero.com/' latestUrl = 'http://www.sarahzero.com/'
stripUrl = latestUrl + 'sz_%s.html' stripUrl = latestUrl + 'sz_%s.html'
imageSearch = compile(tagre("img", "src", r'(z_spreads/sz_[^"]+)')) imageSearch = compile(tagre("img", "src", r'(z_(?:spreads|decoy)/sz_[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(sz_\d+\.html)') + tagre("img", "src", r'z_site/sz_05_nav\.gif')) prevSearch = compile(tagre("a", "href", r'(sz_\d+\.html)') + tagre("img", "src", r'z_site/sz_05_nav\.gif'))
help = 'Index format: nnnn' help = 'Index format: nnnn'
@ -45,7 +45,8 @@ class SchlockMercenary(_BasicScraper):
latestUrl = 'http://www.schlockmercenary.com/' latestUrl = 'http://www.schlockmercenary.com/'
stripUrl = latestUrl + '%s' stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://static\.schlockmercenary\.com/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://static\.schlockmercenary\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/d+)', after="nav-previous")) multipleImagesPerStrip = True
prevSearch = compile(tagre("a", "href", r'(/\d+-\d+-\d+)', quote="'", after="nav-previous"))
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
@ -102,7 +103,7 @@ class SluggyFreelance(_BasicScraper):
class SodiumEyes(_BasicScraper): class SodiumEyes(_BasicScraper):
latestUrl = 'http://sodiumeyes.com/' latestUrl = 'http://sodiumeyes.com/'
stripUrl = latestUrl + '%s/' stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(http://sodiumeyes\.com/comic/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://sodiumeyes\.com/comic/[^ ]+)', quote=""))
prevSearch = compile(tagre("a", "href", r'(http://sodiumeyes\.com/[^"]+)', after="prev")) prevSearch = compile(tagre("a", "href", r'(http://sodiumeyes\.com/[^"]+)', after="prev"))
help = 'Index format: yyyy/mm/dd/stripname' help = 'Index format: yyyy/mm/dd/stripname'
@ -110,9 +111,9 @@ class SodiumEyes(_BasicScraper):
class SpareParts(_BasicScraper): class SpareParts(_BasicScraper):
baseUrl = 'http://www.sparepartscomics.com/' baseUrl = 'http://www.sparepartscomics.com/'
latestUrl = baseUrl + 'comics/?date=20080328' latestUrl = baseUrl + 'comics/?date=20080328'
stripUrl = baseUrl + 'comics/?date=s%' stripUrl = baseUrl + 'comics/index.php?date=%s'
imageSearch = compile(tagre("img", "src", r'http://www\.sparepartscomics\.com/comics/[^"]+')) imageSearch = compile(tagre("img", "src", r'(http://www\.sparepartscomics\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(index\.php\?date=\d+)') + "Previous Comic") prevSearch = compile(tagre("a", "href", r'(index\.php\?date=\d+)', quote="'") + "Previous Comic")
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
@ -127,7 +128,7 @@ class Stubble(_BasicScraper):
class StrawberryDeathCake(_BasicScraper): class StrawberryDeathCake(_BasicScraper):
latestUrl = 'http://strawberrydeathcake.com/' latestUrl = 'http://strawberrydeathcake.com/'
stripUrl = latestUrl + 'archive/%s/' stripUrl = latestUrl + 'archive/%s/'
imageSearch = compile(tagre("img", "src", r'http://strawberrydeathcake\.com/wp-content/webcomic/[^"]+')) imageSearch = compile(tagre("img", "src", r'(http://strawberrydeathcake\.com/wp-content/webcomic/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://strawberrydeathcake\.com/archive/[^"]+)', after="previous")) prevSearch = compile(tagre("a", "href", r'(http://strawberrydeathcake\.com/archive/[^"]+)', after="previous"))
help = 'Index format: stripname' help = 'Index format: stripname'
@ -144,7 +145,8 @@ class SomethingPositive(_BasicScraper):
latestUrl = 'http://www.somethingpositive.net/' latestUrl = 'http://www.somethingpositive.net/'
stripUrl = latestUrl + 'sp%s.shtml' stripUrl = latestUrl + 'sp%s.shtml'
imageSearch = compile(tagre("img", "src", r'(sp\d+\.png)')) imageSearch = compile(tagre("img", "src", r'(sp\d+\.png)'))
prevSearch = compile(tagre("a", "href", r'(sp\d+\.shtml)') + "Previous") prevSearch = compile(tagre("a", "href", r'(sp\d+\.shtml)') +
"(?:" + tagre("img", "src", r'images/previous\.gif') + "|Previous)")
help = 'Index format: mmddyyyy' help = 'Index format: mmddyyyy'
@classmethod @classmethod
@ -152,7 +154,6 @@ class SomethingPositive(_BasicScraper):
return pageUrl.split('/')[-1].split('.')[0] return pageUrl.split('/')[-1].split('.')[0]
class SexyLosers(_BasicScraper): class SexyLosers(_BasicScraper):
stripUrl = 'http://www.sexylosers.com/%s.html' stripUrl = 'http://www.sexylosers.com/%s.html'
imageSearch = compile(r'<img src\s*=\s*"\s*(comics/[\w\.]+?)"', IGNORECASE) imageSearch = compile(r'<img src\s*=\s*"\s*(comics/[\w\.]+?)"', IGNORECASE)
@ -172,7 +173,7 @@ class SexyLosers(_BasicScraper):
class StarCrossdDestiny(_BasicScraper): class StarCrossdDestiny(_BasicScraper):
latestUrl = 'http://www.starcrossd.net/comic.html' latestUrl = 'http://www.starcrossd.net/comic.html'
stripUrl = 'http://www.starcrossd.net/archives/%s.html' stripUrl = 'http://www.starcrossd.net/archives/%s.html'
imageSearch = compile(r'<img src="(http://www\.starcrossd\.net/(?:ch1|strips|book2)/[^"]+)">') imageSearch = compile(tagre("img", "src", r'(http://www\.starcrossd\.net/(?:ch1|strips|book2)/[^"]+)'))
prevSearch = compile(r'<a href="(http://www\.starcrossd\.net/(?:ch1/)?archives/\d+\.html)"[^>]*"[^"]*"[^>]*>prev', IGNORECASE) prevSearch = compile(r'<a href="(http://www\.starcrossd\.net/(?:ch1/)?archives/\d+\.html)"[^>]*"[^"]*"[^>]*>prev', IGNORECASE)
help = 'Index format: nnnnnnnn' help = 'Index format: nnnnnnnn'
@ -212,11 +213,3 @@ class SMBC(_BasicScraper):
prevSearch = compile(r'131,13,216,84"\n\s+href="(.+?)#comic"\n>', MULTILINE) prevSearch = compile(r'131,13,216,84"\n\s+href="(.+?)#comic"\n>', MULTILINE)
help = 'Index format: nnnn' help = 'Index format: nnnn'
class SomethingLikeLife(_BasicScraper):
latestUrl = 'http://www.pulledpunches.com/'
stripUrl = latestUrl + '?p=%s'
imageSearch = compile(r'<img src="(http://www.pulledpunches.com/comics/[^"]*)"')
prevSearch = compile(r'</a> <a href="(http://www.pulledpunches.com/\?p=[^"]*)"><img src="back1.gif"')
help = 'Index format: nn'

View file

@ -10,7 +10,7 @@ from ..util import tagre
class TheNoob(_BasicScraper): class TheNoob(_BasicScraper):
latestUrl = 'http://www.thenoobcomic.com/index.php' latestUrl = 'http://www.thenoobcomic.com/index.php'
stripUrl = latestUrl + '?pos=%' stripUrl = latestUrl + '?pos=%s'
imageSearch = compile(tagre("img", "src", r'(/headquarters/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(/headquarters/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(\?pos=\d+)', before="comic_nav_previous_button")) prevSearch = compile(tagre("a", "href", r'(\?pos=\d+)', before="comic_nav_previous_button"))
help = 'Index format: nnnn' help = 'Index format: nnnn'
@ -19,7 +19,7 @@ class TheNoob(_BasicScraper):
class TheOrderOfTheStick(_BasicScraper): class TheOrderOfTheStick(_BasicScraper):
latestUrl = 'http://www.giantitp.com/comics/oots0863.html' latestUrl = 'http://www.giantitp.com/comics/oots0863.html'
stripUrl = latestUrl + 'comics/oots%s.html' stripUrl = 'http://www.giantitp.com/comics/oots%s.html'
imageSearch = compile(r'<IMG src="(/comics/images/.+?)">') imageSearch = compile(r'<IMG src="(/comics/images/.+?)">')
prevSearch = compile(r'<A href="(/comics/oots\d{4}\.html)"><IMG src="/Images/redesign/ComicNav_Back.gif"') prevSearch = compile(r'<A href="(/comics/oots\d{4}\.html)"><IMG src="/Images/redesign/ComicNav_Back.gif"')
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
@ -31,7 +31,8 @@ class TheParkingLotIsFull(_BasicScraper):
latestUrl = 'http://plif.courageunfettered.com/archive/arch2002.htm' latestUrl = 'http://plif.courageunfettered.com/archive/arch2002.htm'
stripUrl = 'http://plif.courageunfettered.com/archive/arch%s.htm' stripUrl = 'http://plif.courageunfettered.com/archive/arch%s.htm'
imageSearch = compile(r'<td align="center"><A TARGET=_parent HREF="(wc\d+\..+?)">') imageSearch = compile(r'<td align="center"><A TARGET=_parent HREF="(wc\d+\..+?)">')
prevSearch = compile(r'-\s*\n\s*<A HREF="(arch\d{4}\.htm)">\d{4}</A>') multipleImagesPerStrip = True
prevSearch = compile(r'\d{4} -\s+<A HREF="(arch\d{4}\.htm)">\d{4}')
help = 'Index format: nnn' help = 'Index format: nnn'
@ -40,7 +41,7 @@ class TheWotch(_BasicScraper):
latestUrl = 'http://www.thewotch.com/' latestUrl = 'http://www.thewotch.com/'
stripUrl = latestUrl + '?date=%s' stripUrl = latestUrl + '?date=%s'
imageSearch = compile(r"<img.+?src='(comics/.+?)'") imageSearch = compile(r"<img.+?src='(comics/.+?)'")
prevSearch = compile(r"<link rel='Previous' href='(\?date=\d+-\d+-\d+)'") prevSearch = compile(r"<link rel='Previous' href='(/\?date=\d+-\d+-\d+)'")
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'

View file

@ -6,12 +6,12 @@ from re import compile
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..helpers import bounceStarter, indirectStarter from ..helpers import bounceStarter, indirectStarter
from ..util import getQueryParams from ..util import getQueryParams, tagre
class Undertow(_BasicScraper): class Undertow(_BasicScraper):
stripUrl = 'http://undertow.dreamshards.org/%s' stripUrl = 'http://undertow.dreamshards.org/%s'
imageSearch = compile(r'<img src="(.+?)"') imageSearch = compile(tagre("img", "src", r'([^"]+\.jpg)'))
prevSearch = compile(r'href="(.+?)".+?teynpoint') prevSearch = compile(r'href="(.+?)".+?teynpoint')
help = 'Index format: good luck !' help = 'Index format: good luck !'
starter = indirectStarter('http://undertow.dreamshards.org/', starter = indirectStarter('http://undertow.dreamshards.org/',

View file

@ -36,6 +36,7 @@ class WhyTheLongFace(_BasicScraper):
latestUrl = 'http://www.absurdnotions.org/wtlf200709.html' latestUrl = 'http://www.absurdnotions.org/wtlf200709.html'
stripUrl = 'http://www.absurdnotions.org/wtlf%s.html' stripUrl = 'http://www.absurdnotions.org/wtlf%s.html'
imageSearch = compile(r'<img src="(http://www.absurdnotions.org/wtlf.+?|lf\d+.\w{1,4})"', IGNORECASE) imageSearch = compile(r'<img src="(http://www.absurdnotions.org/wtlf.+?|lf\d+.\w{1,4})"', IGNORECASE)
multipleImagesPerStrip = True
prevSearch = compile(r'HREF="(.+?)"><IMG SRC="nprev.gif" ') prevSearch = compile(r'HREF="(.+?)"><IMG SRC="nprev.gif" ')
help = 'Index format: yyyymm' help = 'Index format: yyyymm'
@ -66,7 +67,7 @@ class WorldOfWarcraftEh(_BasicScraper):
class Wulffmorgenthaler(_BasicScraper): class Wulffmorgenthaler(_BasicScraper):
latestUrl = 'http://wumocomicstrip.com/' latestUrl = 'http://wumocomicstrip.com/'
stripUrl = latestUrl + '%s/' stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(/img/strip/thumb/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(/img/strip/[^/"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"]+)') + "<span>Previous") prevSearch = compile(tagre("a", "href", r'([^"]+)') + "<span>Previous")
help = 'Index format: yyyy/mm/dd' help = 'Index format: yyyy/mm/dd'

View file

@ -2,14 +2,15 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam # Copyright (C) 2012 Bastian Kleineidam
from re import compile, IGNORECASE from re import compile
from ..util import tagre
from ..scraper import make_scraper from ..scraper import make_scraper
from ..helpers import bounceStarter from ..helpers import bounceStarter
_imageSearch = compile(r'SRC="(http://www\.wlpcomics\.com/adult/.+?|http://www\.wlpcomics\.com/general/.+?)"', IGNORECASE) _imageSearch = compile(tagre("img", "src", r'(http://www\.wlpcomics\.com/(?:adult|general)/[^"]+)'))
_prevSearch = compile(r'</a> <A HREF="(\w+.html)">Previous Page</a>', IGNORECASE) _prevSearch = compile(tagre("a", "href", r'(\w+.html)') + 'Previous')
_nextSearch = compile(r'</a> <A HREF="(\w+.html)">Next Page</a>', IGNORECASE) _nextSearch = compile(tagre("a", "href", r'(\w+.html)') + 'Next')
def add(name, path): def add(name, path):
@ -35,4 +36,3 @@ add('ChichiChan', 'adult/chichi/')
add('ChocolateMilkMaid', 'adult/cm/') add('ChocolateMilkMaid', 'adult/cm/')
add('MaidAttack', 'general/maidattack/') add('MaidAttack', 'general/maidattack/')
add('ShadowChasers', 'general/shadowchasers/') add('ShadowChasers', 'general/shadowchasers/')
add('Stellar', 'adult/stellar/')

View file

@ -19,8 +19,8 @@ class YAFGC(_BasicScraper):
class YouSayItFirst(_BasicScraper): class YouSayItFirst(_BasicScraper):
latestUrl = 'http://www.yousayitfirst.com/' latestUrl = 'http://www.yousayitfirst.com/'
stripUrl = latestUrl + 'comics/index.php?date=%s' stripUrl = latestUrl + 'comics/index.php?date=%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.yousayitfirst\.com/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r"(http://www\.yousayitfirst\.com/comics/[^>']+)", quote="'?"))
prevSearch = compile(tagre("a", "href", r'(http://www\.yousayitfirst\.com/comics/index\.php\?date=\d+)') + "Previous") prevSearch = compile(tagre("a", "href", r'(http://www\.yousayitfirst\.com/comics/index\.php\?date=\d+)', quote="'") + "Previous")
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'

View file

@ -17,6 +17,11 @@ class Zapiro(_BasicScraper):
prevSearch = compile(tagre("a", "href", r'(http://mg\.co\.za/cartoon/[^"]+)')+"Older") prevSearch = compile(tagre("a", "href", r'(http://mg\.co\.za/cartoon/[^"]+)')+"Older")
help = 'Index format: yyyy-mm-dd-stripname' help = 'Index format: yyyy-mm-dd-stripname'
@classmethod
def namer(cls, imageUrl, pageUrl):
name = imageUrl.split('/')[-3]
return name
class ZombieHunters(_BasicScraper): class ZombieHunters(_BasicScraper):
latestUrl = 'http://www.thezombiehunters.com/' latestUrl = 'http://www.thezombiehunters.com/'

View file

@ -78,6 +78,7 @@ class _BasicScraper(object):
while url: while url:
imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch) imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
prevUrl = self.prevUrlModifier(prevUrl) prevUrl = self.prevUrlModifier(prevUrl)
out.write("Matched previous URL %s" % prevUrl, 2)
seen_urls.add(url) seen_urls.add(url)
yield self.getComicStrip(url, imageUrls) yield self.getComicStrip(url, imageUrls)
# avoid recursive URL loops # avoid recursive URL loops

View file

@ -163,7 +163,7 @@ def normaliseURL(url):
pu = list(urlparse.urlparse(url)) pu = list(urlparse.urlparse(url))
segments = pu[2].split('/') segments = pu[2].split('/')
while segments and segments[0] == '': while segments and segments[0] in ('', '..'):
del segments[0] del segments[0]
pu[2] = '/' + '/'.join(segments).replace(' ', '%20') pu[2] = '/' + '/'.join(segments).replace(' ', '%20')
# remove leading '&' from query # remove leading '&' from query

View file

@ -16,14 +16,137 @@ json_file = __file__.replace(".py", ".json")
# names of comics to exclude # names of comics to exclude
exclude_comics = [ exclude_comics = [
"Twonks_and_Plonkers", # broken images, no real content
"U_Chuu_No_Hoshi_Hotoshi_Tsuko", # broken images
"Red_Dog_Venue", # start page is broken
"Monster_Lover", # start page is broken "Monster_Lover", # start page is broken
"Legacy_of_Blaze", # broken images "Legacy_of_Blaze", # broken images
"Dead_Strangers", # broken images "Dead_Strangers", # broken images
"Crack", # broken images "Crack", # broken images
"Iron_Wolf", # broken images "Iron_Wolf", # broken images
"A_Call_to_Destiny__NC_17", # start page requires login
"A_Call_to_Destiny_Reloaded", # start page requires login
"A_Day_in_the_Life_for_Erik", # broken images
"A_Fairly_Twisted_Reality", # start page requires login
"Al_and_Scout", # broken images
"ANGELOU_____Las_aventuras_de_Nikole", # broken images
"Apartment_408_Full_Size", # broken images
"Apple_Valley", # broken images
"Apt_408_Minis", # broken images
"atxs", # broken images
"A_Word_Of_Wisdom", # broken images
"Brathalla", # broken images
"Binary_Souls_Other_Dimensions", # broken images
"BK_Shattered_Hate", # broken images
"Chomp", # broken images
"Chu_and_Kenny", # broken images
"Coga_Suro_2", # broken images
"Creepy_Girl_and_Her_Zombie_Dog", # broken images
"CuoreVoodoo", # broken images
"dairyaire", # broken images
"DIS", # broken images
"Dot_TXT", # broken images
"Dreadnought_Invasion_Six", # broken images
"Emerald_Winter", # broken images
"Enter_the_Duck_2", # broken images
"ffff", # broken images
"Function_Over_Fashion", # broken images
"Funday_Morning", # broken images
"greys_journey", # broken images
"Head_over_Heart", # broken images
"Hurrocks_Fardel", # broken images
"Bhaddland", # start page requires login
"Bouncing_Orbs_of_Beauty", # start page requires login
"Busty_Solar", # start page requires login
"Illusional_Beauty", # broken images
"Indigo_Bunting__Vampire", # start page requires login
"Irrumator", # start page requires login
"Its_A_Boy_Thing", # start page requires login
"Kokuahiru_comics", # start page requires login
"Inside_OuT", # broken images
"Journey_to_Raifina", # broken images
"KALA_dan", # broken images
"Live_to_tell", # start page requires login
"Locoma", # broken images
"London_Underworld", # broken images
"Louder_Than_Bombs", # broken images
"Lucky_Dawg", # broken images
"Mario_in_Johto", # broken images
"Master", # start page requires login
"Mastermind_BTRN", # broken images
"MAYA_____The_legend_of_Wolf", # broken images
"Megaman_Zero", # broken images
"Monster_Lover_Destinys_Path", # start page requires login
"M_Organ_Art", # start page requires login
"Morning_Squirtz", # start page requires login
"MOSAIC", # broken images
"My_Angel_and_My_Devil", # broken images
"Nemution_Jewel", # start page requires login
"Nemution_Redux", # start page requires login
"New_Pages", # broken images
"Ninja_Shizatch", # broken images
"Normalcy_is_for_Wimps", # broken images
"MIKYAGU", # broken images
"One_Third_Of_Your_Life_Is_Spent_Sleeping_One_Third_Of_Your_Life_Is_Spent_Working_And_Half_Of_One_Third_Is_Spent_Waiting_The_Question_Is_It_Really_Your_Life", # broken images
"OTENBA_Files", # start page requires login
"Panacea", # start page requires login
"Parker_Lot", # broken images
"Peter_And_The_Wolf", # start page requires login
"Perspectives", # broken images
"Pokemon_Sinnoh_Surfer", # broken images
"Pokemon_World_Trainers", # broken images
"Potpourri_of_Lascivious_Whimsy", # start page requires login
"Pr0nCrest", # start page requires login
"punished_girls", # start page requires login
"Powerjeff", # broken images
"Comicarotica", # start page requires login
"Dark_Sisters", # start page requires login
"Death_P0rn", # start page requires login
"Dreams_in_Synergy", # broken images
"GNight_Shade", # start page requires login
"GRIND", # start page requires login
"HUSS", # start page requires login
"Red_Dog_Venue", # start page is broken
"rubber_girls", # start page requires login
"Robomeks", # broken images
"Robot_Friday", # broken images
"SFA", # start page requires login
"Shadow_Root", # start page requires login
"Shiro_Karasu", # start page requires login
"Shelter_of_Wings", # broken images
"Some_Notes", # broken images
"Sonic_Advanced_Online", # broken images
"Sonic_and_tails_corner", # broken images
"Sonic_Unreal", # broken images
"Tales_of_Schlock", # start page requires login
"Splices_of_Life", # broken images
"STARSEARCHERS", # broken images
"Ted_The_Terrible_Superhero", # broken images
"Terra_online_comic", # broken images
"The_Auragon_Base", # broken images
"The_Bend", # broken images
"The_Chronicles_of_Drew", # broken images
"The_Devils_Horn", # broken images
"The_Dragon_and_the_Lemur", # start page requires login
"The_Fighting_Stranger", # broken images
"The_Mighty_Omega", # broken images
"The_Misadventures_of_Everyone", # start page requires login
"The_NEW_Life_Of_TimmY", # broken images
"The_SSA", # broken images
"Tony_The_Hedgehog", # broken images
"Trapped_in_a_Comic", # start page requires login
"Unsound_of_Mind", # broken images
"Vampire_Chronicles__Dark_Lust", # start page requires login
"WarMage", # start page requires login
"Watashi_No_Ame", # broken images
"Weave", # broken images
"Weirdlings", # template error
"Welcome_To_Border_City", # broken images
"what_comes_first", # start page requires login
"Within_Shadows", # broken images
"Xolta", # start page requires login
"XTIN__The_Dragons_Dream_World", # start page requires login
"X_UP", # start page requires login
"Zandars_Saga", # start page requires login
"Twonks_and_Plonkers", # broken images, no real content
"U_Chuu_No_Hoshi_Hotoshi_Tsuko", # broken images
] ]

View file

@ -21,6 +21,22 @@ url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r
# names of comics to exclude # names of comics to exclude
exclude_comics = [ exclude_comics = [
"FrikkFrakkAndFrank", # too few comics "FrikkFrakkAndFrank", # too few comics
"Apocalypseharry", # too few comics
"BatkidandBatrat", # too few comics
"BETWEENTHELINES", # comic unavailable
"Bonner", # missing page
"Buster", # comic unavailable
"DALTONDOG", # comic unavailable
"DellAndSteve", # too few comics
"Dilbert", # redirect
"InkeeDoodles", # comic unavailable
"MaggiesComics", # too few comics
"OfMiceandMud", # too few comics
"OysterWar", # too few comics
"PIGTIMES", # comic unavailable
"PS", # comic unavailable
"SherpaAid", # comic unavailable
"SparComics", # comic unavailable
] ]

View file

@ -21,27 +21,59 @@ num_matcher = re.compile(r'Number of Days: (\d+)')
# names of comics to exclude # names of comics to exclude
exclude_comics = [ exclude_comics = [
"10", # page is gone
"54sinRed", # page is 403 forbidden
"6D4", # redirected to another page
"AaaSoCAwesomenessandaSliceofCheese", # broken images
"AcrossthePond", # page moved
"ACDeceptibotscomic", # no images
"AdamandSei", # page has 403 forbidden
"AdamsRoadGang", # page is gone
"ADVENTURERS", # page is gone
"AiYaiYai", # page moved
"AlltheCommies", # missing images
"AltaModaMetro", # page redirected
"AltarGirl", # page redirected
"Amerika", # no images
"Angels", # page has 403 forbidden
"AngryDMonkey", # page redirected
"Angst", # page redirected
"Animenifesto", # too few images
"Anna", # no images
"Arcana", # archive broken
"Area15", # no images
"BaidheTu", # no images
"BasilFlint", # page redirected
"beerkada", # no images
"BelovedLeader", # broken images
"BigMouthComics", # page does not follow standard layout
"", # page is gone
"", # page is gone
"", # page is gone
"BlueZombie", # broken page
"BoomerExpress", # redirection to another page
"DungeonDamage", # page does not follow standard layout
"EarthRiser", # redirects to a new page
"FaultyLogic", # page does not follow standard layout
"GoForIt", # page is gone
"JuvenileDiversion", # page moved "JuvenileDiversion", # page moved
"JustWeird", # page has 403 forbidden "JustWeird", # page has 403 forbidden
"Michikomonogatari", # page does not follow standard layout
"MobileMadness", # page does not follow standard layout "MobileMadness", # page does not follow standard layout
"KnightsOfTheNexus", # page does not follow standard layout "KnightsOfTheNexus", # page does not follow standard layout
"RogerAndDominic", # page does not follow standard layout "RogerAndDominic", # page does not follow standard layout
"TheAvatar", # page does not follow standard layout
"Michikomonogatari", # page does not follow standard layout
"DungeonDamage", # page does not follow standard layout
"SaveMeGebus", # page does not follow standard layout "SaveMeGebus", # page does not follow standard layout
"BlueZombie", # broken page "TheAvatar", # page does not follow standard layout
"BoomerExpress", # redirection to another page
"FaultyLogic", # page does not follow standard layout
"EarthRiser", # redirects to a new page
"GoForIt", # page is gone
"ACDeceptibotscomic", # no images
] ]
# links to last valid strips
url_overrides = { url_overrides = {
# link to last valid strip
"BallofYarn": "http://ballofyarn.comicgenesis.com/d/20020624.html", "BallofYarn": "http://ballofyarn.comicgenesis.com/d/20020624.html",
"AmazonSpaceRangers": "http://amazons.comicgenesis.com/d/20051015.html",
"ArroganceinSimplicity": "http://arrogance.comicgenesis.com/d/20030217.html",
"ATasteofEvil": "http://atasteofevil.comicgenesis.com/d/20050314.html",
"": "",
"": "",
} }
def handle_url(url, res): def handle_url(url, res):

View file

@ -33,30 +33,31 @@ class _ComicTester(TestCase):
# at least 5 strips from the start, and find strip images # at least 5 strips from the start, and find strip images
# on at least 4 pages. # on at least 4 pages.
scraperobj = self.scraperclass() scraperobj = self.scraperclass()
num = empty = 0 num = 0
max_strips = 5 max_strips = 5
for strip in islice(scraperobj.getAllStrips(), 0, max_strips): for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
images = 0 images = 0
for image in strip.getImages(): for image in strip.getImages():
images += 1 images += 1
self.save(image) self.save(image)
if images == 0: self.check(images > 0, 'failed to find images at %s' % strip.stripUrl)
empty += 1 if not self.scraperclass.multipleImagesPerStrip:
self.check(images == 1, 'found %d instead of 1 image at %s' % (images, strip.stripUrl))
if num > 0: if num > 0:
self.check_stripurl(strip) self.check_stripurl(strip)
num += 1 num += 1
if self.scraperclass.prevSearch: if self.scraperclass.prevSearch:
self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num) self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern at %s.' % (num, strip.stripUrl))
# check that at exactly or for multiple pages at least 5 images are saved # Check that exactly or for multiple pages at least 5 images are saved.
# This is different from the image number check above since it checks saved files,
# i.e. it detects duplicate filenames.
saved_images = self.get_saved_images() saved_images = self.get_saved_images()
num_images = len(saved_images) num_images = len(saved_images)
attrs = (num_images, saved_images, max_strips, self.tmpdir)
if self.scraperclass.multipleImagesPerStrip: if self.scraperclass.multipleImagesPerStrip:
self.check(num_images >= max_strips, self.check(num_images >= max_strips, 'saved %d %s instead of at least %d images in %s' % attrs)
'saved %d %s instead of at least %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
else: else:
self.check(num_images == max_strips, self.check(num_images == max_strips, 'saved %d %s instead of %d images in %s' % attrs)
'saved %d %s instead of %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty)
def check_stripurl(self, strip): def check_stripurl(self, strip):
if not self.scraperclass.stripUrl: if not self.scraperclass.stripUrl: