Fix comics.

This commit is contained in:
Bastian Kleineidam 2012-12-04 07:02:40 +01:00
parent 45df462a47
commit 387dff79a9
33 changed files with 372 additions and 241 deletions

4
dosage
View file

@ -98,6 +98,7 @@ def getComics(options, comics):
else:
strips = scraperobj.getCurrentStrips()
first = True
try:
for strip in strips:
_errors, skipped = saveComicStrip(strip, options.basepath)
errors += _errors
@ -107,6 +108,9 @@ def getComics(options, comics):
out.write("Stop retrieval because image file already exists")
break
first = False
except IOError as msg:
out.write('Error getting strip: %s' % msg)
errors += 1
events.getHandler().end()
return errors

View file

@ -1,10 +1,7 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
import re
from .util import fetchUrl, getQueryParams
from .scraper import _BasicScraper
def queryNamer(paramName, usePageUrl=False):
"""Get name from URL query part."""
@ -81,19 +78,3 @@ class IndirectLatestMixin(object):
latestUrl = property(getLatestUrl)
class _PHPScraper(_BasicScraper):
"""
Scraper for comics using phpComic/CUSP.
This provides an easy way to define scrapers for webcomics using phpComic.
"""
imageUrl = property(lambda self: self.basePath + 'daily.php?date=%s')
imageSearch = property(lambda self: re.compile(r'<img alt=[^>]+ src="(%scomics/\d{6}\..+?)">' % (self.basePath,)))
help = 'Index format: yymmdd'
@classmethod
def starter(cls):
"""Get starter URL."""
return cls.basePath + cls.latestUrl

View file

@ -5,8 +5,8 @@
from re import compile
from ..scraper import _BasicScraper
from ..helpers import bounceStarter, indirectStarter
from ..util import tagre, getQueryParams
from ..helpers import bounceStarter
from ..util import tagre
class CaptainSNES(_BasicScraper):
@ -144,37 +144,6 @@ class Curvy(_BasicScraper):
help = 'Index format: yyyymmdd'
def cloneManga(name, shortName, lastStrip=None):
url = 'http://manga.clone-army.org'
baseUrl = '%s/%s.php' % (url, shortName)
def namer(self, imageUrl, pageUrl):
return '%03d' % int(getQueryParams(pageUrl)['page'][0])
attrs = dict(
name='CloneManga/' + name,
stripUrl = baseUrl + '?page=%s',
imageSearch=compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (url, shortName), after="center")),
prevSearch=compile(tagre("a", "href", r'([^"]+)')+tagre("img", "src", r"previous\.gif")),
help='Index format: n',
namer=namer,
)
if lastStrip is None:
attrs['starter'] = indirectStarter(baseUrl, compile(tagre("a", "href", r'([^"]+)')+tagre("img", "src", r"last\.gif")))
else:
attrs['latestUrl'] = attrs['stripUrl'] % lastStrip
return type('CloneManga_%s' % name, (_BasicScraper,), attrs)
anm = cloneManga('AprilAndMay', 'anm')
kanami = cloneManga('Kanami', 'kanami')
momoka = cloneManga('MomokaCorner', 'momoka')
nana = cloneManga('NanasEverydayLife', 'nana', '78')
pxi = cloneManga('PaperEleven', 'pxi', '311')
t42r = cloneManga('Tomoyo42sRoom', 't42r')
penny = cloneManga('PennyTribute', 'penny')
class CatAndGirl(_BasicScraper):
latestUrl = 'http://catandgirl.com/'
stripUrl = latestUrl + '?p=%s'

View file

@ -0,0 +1,61 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from re import compile
from ..scraper import make_scraper
from ..util import tagre, getQueryParams, fetchUrl
# Shared navigation patterns for all CloneManga comics.
# _linkTag captures the href value of an <a> tag as group 1
# (presumably tagre() builds a regex for a tag/attribute pair -- see ..util).
_linkTag = tagre("a", "href", r'([^"]+)')
# Each search is the link pattern directly followed by the matching
# navigation image (previous.gif / next.gif / last.gif).
_prevSearch = compile(_linkTag + tagre("img", "src", r"previous\.gif"))
_nextSearch = compile(_linkTag + tagre("img", "src", r"next\.gif"))
_lastSearch = compile(_linkTag + tagre("img", "src", r"last\.gif"))
def add(name, shortName, imageFolder=None, lastStrip=None):
    """Create a CloneManga scraper class and register it in this module.

    The generated class is stored in the module globals under the name
    ``CloneManga_<name>``.

    @param name: comic name, used for the class name and the scraper name
    @param shortName: name of the comic's PHP page on manga.clone-army.org
    @param imageFolder: folder name used in image URLs; defaults to shortName
    @param lastStrip: if given, the page index of the final strip; the
        scraper then gets a fixed latestUrl instead of a starter function
    """
    classname = 'CloneManga_%s' % name
    _url = 'http://manga.clone-army.org'
    baseUrl = '%s/%s.php' % (_url, shortName)
    if imageFolder is None:
        imageFolder = shortName

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Name the image after the zero-padded 'page' query parameter."""
        return '%03d' % int(getQueryParams(pageUrl)['page'][0])

    @classmethod
    def _starter(cls):
        """Find the URL of the latest strip, starting from the base page.

        Raises ValueError if the required navigation link is not found.
        """
        # first, try hopping to previous and next comic
        prevUrl = fetchUrl(baseUrl, _prevSearch)
        if not prevUrl:
            # no previous link found, try hopping to last comic
            lastUrl = fetchUrl(baseUrl, _lastSearch)
            if not lastUrl:
                raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, baseUrl))
            return lastUrl
        nextUrl = fetchUrl(prevUrl, _nextSearch)
        if not nextUrl:
            # Report the page that was searched; the fetch result itself is
            # falsy here and would make the message useless.
            raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, prevUrl))
        return nextUrl

    attrs = dict(
        name='CloneManga/' + name,
        stripUrl=baseUrl + '?page=%s',
        imageSearch=compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (_url, imageFolder), after="center")),
        prevSearch=_prevSearch,
        help='Index format: n',
        namer=namer,
    )
    if lastStrip is None:
        # No known final strip: discover the latest one at runtime.
        attrs['starter'] = _starter
    else:
        attrs['latestUrl'] = attrs['stripUrl'] % lastStrip
    globals()[classname] = make_scraper(classname, **attrs)
# Register one scraper class per CloneManga comic.
# imageFolder is passed where the image directory differs from the short
# name; lastStrip is passed to pin latestUrl to a fixed page index instead
# of using the starter lookup.
add('AprilAndMay', 'anm', imageFolder='AAM')
add('Kanami', 'kanami')
add('MomokaCorner', 'momoka')
add('NanasEverydayLife', 'nana', lastStrip='78')
add('PaperEleven', 'pxi', imageFolder='papereleven', lastStrip='311')
add('Tomoyo42sRoom', 't42r')
add('PennyTribute', 'penny')

View file

@ -6,25 +6,26 @@ from re import compile
from ..scraper import _BasicScraper
from ..helpers import indirectStarter
from ..util import tagre, getQueryParams
from ..util import tagre
class DMFA(_BasicScraper):
latestUrl = 'http://www.missmab.com/'
stripUrl = latestUrl + 'Comics/Vol_%s.php'
imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"])+')+
tagre("img", "src", r'(?:../)?Images/comicprev.gif'))
multipleImagesPerStrip = True
prevSearch = compile(tagre("a", "href", r'((?:Comics/)?Vol[^"]+)')+
tagre("img", "src", r'(?:../)?Images/comicprev\.gif'))
help = 'Index format: nnn (normally, some specials)'
class DandyAndCompany(_BasicScraper):
latestUrl = 'http://www.dandyandcompany.com/'
stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'([^"]*/strips/[^"]+)'))
prevSearch = compile(r'<a href="(.*)" class="prev"')
help = 'Index format: yyyy/mm/dd'
stripUrl = None
multipleImagesPerStrip = True
imageSearch = compile(tagre("a", "href", r'(http://\d+\.bp\.blogspot\.com/[^"]+)', after="imageanchor"))
prevSearch = compile(tagre("a", "href", r"([^']+)", quote="'", after="Older Posts"))
help = 'Index format: none'
class DarkWings(_BasicScraper):
@ -63,11 +64,11 @@ class DrFun(_BasicScraper):
latestUrl = 'http://www.ibiblio.org/Dave/ar00502.htm'
stripUrl = 'http://www.ibiblio.org/Dave/ar%s.htm'
imageSearch = compile(r'<A HREF= "(Dr-Fun/df\d{6}/df.+?)">')
multipleImagesPerStrip = True
prevSearch = compile(r'<A HREF="(.+?)">Previous Week,')
help = 'Index format: nnnnn'
class Dracula(_BasicScraper):
latestUrl = 'http://draculacomic.net/'
stripUrl = latestUrl + 'comic.php?comicID=%s'
@ -76,7 +77,6 @@ class Dracula(_BasicScraper):
help = 'Index format: nnn'
class DragonTails(_BasicScraper):
latestUrl = 'http://www.dragon-tails.com/'
stripUrl = latestUrl + 'archive.php?date=%s'
@ -96,7 +96,7 @@ class DreamKeepersPrelude(_BasicScraper):
class Drowtales(_BasicScraper):
latestUrl = 'http://www.drowtales.com/mainarchive.php'
stripUrl = latestUrl + '?sid=%s'
imageSearch = compile(tagre("img", "src", r'("http://www.drowtales.com/mainarchive/[^"]+)'))
imageSearch = compile(tagre("img", "src", r'(http://www\.drowtales\.com/mainarchive/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(\?sid=\d+)', before="link_prev_top"))
help = 'Index format: number'
@ -105,7 +105,8 @@ class DieselSweeties(_BasicScraper):
latestUrl = 'http://www.dieselsweeties.com/'
stripUrl = latestUrl + 'archive/%s'
imageSearch = compile(tagre("img", "src", r'(/hstrips/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/archive/\d+)') + tagre("img", "src", r'http://www\.dieselsweeties\.com/ximages/blackbackarrow160.png'))
prevSearch = compile(tagre("a", "href", r'(/archive/\d+)') +
tagre("img", "src", r'(?:http://www\.dieselsweeties\.com/ximages/blackbackarrow160.png|/ximages/prev\.gif)'))
help = 'Index format: n (unpadded)'
@classmethod
@ -118,14 +119,10 @@ class DieselSweeties(_BasicScraper):
class DominicDeegan(_BasicScraper):
latestUrl = 'http://www.dominic-deegan.com/'
stripUrl = latestUrl + 'view.php?date=%s'
imageSearch = compile(r'<img src="(.+?save-as=.+?)" alt')
imageSearch = compile(tagre("img", "src", r'(comics/\d+\.gif)'))
prevSearch = compile(r'"(view.php\?date=.+?)".+?prev21')
help = 'Index format: yyyy-mm-dd'
@classmethod
def namer(cls, imageUrl, pageUrl):
return getQueryParams(imageUrl)['save-as'][0].rsplit('.', 1)[0]
class DorkTower(_BasicScraper):
latestUrl = 'http://www.dorktower.com/'

View file

@ -35,7 +35,7 @@ def add(name):
return url
url = fetchUrl(url, _nextSearch)
if not url:
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, _url))
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
return url
globals()[classname] = make_scraper(classname,

View file

@ -20,16 +20,10 @@ class EerieCuties(_BasicScraper):
class Eriadan(_BasicScraper):
latestUrl = 'http://www.shockdom.com/webcomics/eriadan/'
stripUrl = latestUrl + '%s'
# XXX fix image search
imageSearch = compile(r'title="[^"]+?" src="http://www\.shockdom\.com/eriadan/(wp-content/uploads/.+?)"')
imageSearch = compile(tagre("img", "src", r'(http://www\.shockdom\.com/webcomics/eriadan/files/[^"]+)', after='alt=""'))
prevSearch = compile(tagre("a", "href", r'([^"]+)', after="prev"))
help = 'Index format: yyyy/mm/dd/nnn (unpadded)'
@classmethod
def namer(cls, imageUrl, pageUrl):
return '%d' % (int(compile(r'p=(\d+)').search(pageUrl).group(1)))
class ElGoonishShive(_BasicScraper):
name = 'KeenSpot/ElGoonishShive'
@ -40,7 +34,6 @@ class ElGoonishShive(_BasicScraper):
help = 'Index format: yyyy-mm-dd'
class ElGoonishShiveNP(_BasicScraper):
name = 'KeenSpot/ElGoonishShiveNP'
latestUrl = 'http://www.egscomics.com/egsnp/'
@ -52,12 +45,10 @@ class ElGoonishShiveNP(_BasicScraper):
class EmergencyExit(_BasicScraper):
latestUrl = 'http://www.eecomics.net/'
stripUrl = None
stripUrl = latestUrl + "?strip_id=%s"
imageSearch = compile(r'"(comics/.+?)"')
prevSearch = compile(r'START.+?"(.+?)"')
# XXX ?
help = 'God help us now!'
prevSearch = compile(tagre("a", "href", r'(\?strip_id=\d+)') + tagre("img", "alt", r"Prior"))
help = 'Index format: n'
class ErrantStory(_BasicScraper):
@ -102,7 +93,7 @@ class EvilInc(_BasicScraper):
class Exiern(_BasicScraper):
latestUrl = 'http://www.exiern.com/'
stripUrl = latestUrl + '?p=%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.exiern\.com/comics/[^"])'))
imageSearch = compile(tagre("img", "src", r'(http://www\.exiern\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.exiern\.com/[^"]+)', after="prev"))
help = 'Index format: n'
@ -154,7 +145,6 @@ class ExploitationNow(_BasicScraper):
class Ellerbisms(_BasicScraper):
latestUrl = 'http://www.ellerbisms.com/'
stripUrl = latestUrl + '?p=%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.ellerbisms\.com/wp-content/uploads/[^"]+)'))
imageSearch = compile(tagre("img", "src", r'(http://www\.ellerbisms\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.ellerbisms\.com/[^"]+)', after="prev"))
help = 'Index format: nnn'

View file

@ -49,12 +49,12 @@ class FlakyPastry(_BasicScraper):
prevSearch = compile(r'<a href="(.+?)".+?btn_back')
help = 'Index format: nnnn'
# XXX move to keenspot
class Flipside(_BasicScraper):
latestUrl = 'http://flipside.keenspot.com/comic.php'
stripUrl = latestUrl + '?i=%s'
imageSearch = compile(r'<IMG SRC="(comic/.+?)"')
prevSearch = compile(r'<A HREF="(comic.php\?i=\d+?)">&lt')
imageSearch = compile(tagre("img", "src", r'(http://cdn\.flipside\.keenspot\.com/comic/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://flipside\.keenspot\.com/comic\.php\?i=\d+)', after="prev"))
help = 'Index format: nnnn'
@ -114,7 +114,8 @@ class FredoAndPidjin(_BasicScraper):
homepage = 'http://www.pidjin.net/'
stripUrl = None
help = 'Index format: yyyy/mm/dd/name'
imageSearch = compile(tagre('img', 'src', '(http://cdn\.pidjin\.net/wp-content/uploads/\d\d\d\d/\d\d/\d+[^"]+\.png)'))
imageSearch = compile(tagre('img', 'src', '(http://cdn\.pidjin\.net/wp-content/uploads/\d+/\d+/[^"]+\.png)'))
multipleImagesPerStrip = True
prevSearch = compile(tagre('a', 'href', '([^"]+)')+"Prev</a>")
starter = indirectStarter(homepage,
compile(tagre('a', 'href', "("+homepage+r'\d\d\d\d/\d\d/\d\d/[^"]+/)')))

View file

@ -12,7 +12,7 @@ _prevSearch = compile(r' <a href="(http://www\.thefallenangel\.co\.uk/.+?)"><img
def add(name, shortname):
latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
classname = asciify(name)
classname = "FallenAngel_" + asciify(name)
globals()[classname] = make_scraper(classname,
latestUrl = latestUrl,
stripUrl = latestUrl + '?date=%s',

View file

@ -34,17 +34,9 @@ class GUComics(_BasicScraper):
help = 'Index format: yyyymmdd'
class GenrezvousPoint(_BasicScraper):
latestUrl = 'http://www.genrezvouspoint.com/'
stripUrl = latestUrl + 'index.php?comicID=%s'
imageSearch = compile(r'<img src=\'(comics/.+?)\'')
prevSearch = compile(r' <a[^>]+?href="(.+?)">PREVIOUS</a>')
help = 'Index format: nnn'
class GirlGenius(_BasicScraper):
latestUrl = 'http://girlgeniusonline.com/comic.php'
stripUrl = 'http://www.girlgeniusonline.com/comic.php?date=%s'
stripUrl = latestUrl + '?date=%s'
imageSearch = compile(r"(/ggmain/strips/.+?)'")
prevSearch = compile(r"</a> <a href=.+?(/comic.php\?date=.+?)'.+?Previous")
help = 'Index format: yyyymmdd'
@ -99,7 +91,8 @@ class Gunshow(_BasicScraper):
latestUrl = 'http://gunshowcomic.com/'
stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://gunshowcomic\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"]+)') + tagre("img", "src", r'[^"]+menu/small/previous\.gif'))
multipleImagesPerStrip = True
prevSearch = compile(tagre("a", "href", r'([^"]+)') + tagre("img", "src", r'[^"]*menu/small/previous\.gif'))
help = 'Index format: n'
@ -131,7 +124,7 @@ class GlassHalfEmpty(_BasicScraper):
latestUrl = 'http://www.defectivity.com/ghe/index.php'
stripUrl = latestUrl + '?strip_id=%s'
imageSearch = compile(r'src="(comics/.+?)"')
prevSearch = compile(r'</a><a href="(.+?)"><img src="\.\./images/onback\.jpg"')
prevSearch = compile(tagre("a", "href", r'(\?strip_id=\d+)') + tagre("img", "src", r'\.\./images/arrowbuttons/onback\.jpg'))
help = 'Index format: nnn'

View file

@ -11,12 +11,3 @@ class HorribleVille(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/d/[^"]+)') + tagre("img", "src", r'/images/previous\.png'))
help = 'Index format: yyyymmdd'
class HelpDesk(_BasicScraper):
latestUrl = 'https://www.eviscerati.org/comics?page=78'
stripUrl = 'https://www.eviscerati.org/comics?page=%s'
imageSearch = compile(tagre("img", "src", r'(https://www\.eviscerati\.org/files/comics/[^"]+)'))
prevSearch = compile(tagre("li", "class", r'pager-previous[^"]+') + tagre("a", "href", r'(/comics\?page=%d+)'))
help = 'Index format: n'

View file

@ -9,9 +9,9 @@ from ..util import tagre
class IDreamOfAJeanieBottle(_BasicScraper):
latestUrl = 'http://jeaniebottle.com/'
stripUrl = latestUrl + 'review.php?comicID='
stripUrl = latestUrl + '?p=%s'
imageSearch = compile(r'(/comics/.+?)"')
prevSearch = compile(r'First".+?(review.php.+?)".+?prev_a.gif')
prevSearch = compile(tagre("a", "href", r'(http://jeaniebottle\.com/\?p=\d+)', after="prev"))
help = 'Index format: n (unpadded)'

View file

@ -2,7 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from re import compile, MULTILINE
from re import compile
from ..scraper import _BasicScraper
from ..util import tagre
@ -21,12 +21,3 @@ class JoeAndMonkey(_BasicScraper):
imageSearch = compile(r'"(/comic/[^"]+)"')
prevSearch = compile(r"<a href='(/\d+)'>Previous")
help = 'Index format: nnn'
class JoyOfTech(_BasicScraper):
latestUrl = 'http://www.geekculture.com/joyoftech/'
stripUrl = latestUrl + 'joyarchives/%s.html'
imageSearch = compile(tagre("img", "src", r'(joyimages/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(joyarchives/[^"]+)') + r'.+?Previous', MULTILINE)
help = 'Index format: nnn'

View file

@ -4,7 +4,7 @@
from re import compile, IGNORECASE
from ..scraper import _BasicScraper
from ..util import tagre
class Key(_BasicScraper):
latestUrl = 'http://key.shadilyn.com/latestpage.html'
@ -25,7 +25,7 @@ class Krakow(_BasicScraper):
class Kukuburi(_BasicScraper):
latestUrl = 'http://www.kukuburi.com/current/'
stripUrl = 'http://www.kukuburi.com/v2/%s/'
imageSearch = compile(r'img src="(http://www.kukuburi.com/../comics/.+?)"')
imageSearch = compile(tagre("img", "src", r'(http://www\.kukuburi\.com/v2/comics/[^"]+)', after='alt="[^"]'))
prevSearch = compile(r'nav-previous.+?"(http.+?)"')
help = 'Index format: yyyy/mm/dd/stripname'

View file

@ -16,16 +16,6 @@ class LasLindas(_BasicScraper):
help = 'Index format: stripname'
class LesbianPiratesFromOuterSpace(_BasicScraper):
latestUrl = 'http://rosalarian.com/lesbianpirates/'
stripUrl = latestUrl + 'index.php?p=%s'
imageSearch = compile(tagre("img", "src", r'("comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/index\.php\?id=\d+)', after="prev"))
help = 'Index format: n'
class Lint(_BasicScraper):
latestUrl = 'http://www.purnicellin.com/lint/'
stripUrl = latestUrl + '%s'
@ -34,7 +24,6 @@ class Lint(_BasicScraper):
help = 'Index format: yyyy/mm/dd/num-name'
class LookingForGroup(_BasicScraper):
latestUrl = 'http://www.lfgcomic.com/page/latest'
stripUrl = 'http://www.lfgcomic.com/page/%s'
@ -51,8 +40,8 @@ class LookingForGroup(_BasicScraper):
class LittleGamers(_BasicScraper):
latestUrl = 'http://www.little-gamers.com/'
stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(http://www\.little-gamers\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www.little-gamers.com/[^"]+)', before="comic-nav-prev-link"))
imageSearch = compile(tagre("img", "src", r'(http://little-gamers\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.little-gamers.com/[^"]+)', before="comic-nav-prev-link"))
help = 'Index format: yyyy/mm/dd/name'

View file

@ -67,8 +67,8 @@ class Melonpool(_BasicScraper):
class Misfile(_BasicScraper):
latestUrl = 'http://www.misfile.com/'
stripUrl = latestUrl + '?date=%s'
imageSearch = compile(tagre("img", "src", r'(comics/[^"]+)'))
prevSearch = compile(tagre("link", "href", r'([^"]+)', before="Previous"))
imageSearch = compile(tagre("img", "src", r"(comics/[^']+)", quote="'"))
prevSearch = compile(tagre("link", "href", r"([^']+)", quote="'", before="Previous"))
help = 'Index format: yyyy-mm-dd'
@ -76,15 +76,6 @@ class MysteriesOfTheArcana(_BasicScraper):
latestUrl = 'http://mysteriesofthearcana.com/'
stripUrl = latestUrl + 'index.php?action=comics&cid=%s'
imageSearch = compile(tagre("img", "src", r'(image\.php\?type=com&i=[^"]+)'))
prevSearch = compile(tagre("a", "href", r'()', after="navprevius"))
prevSearch = compile(tagre("a", "href", r'(index\.php[^"]+)', after="navprevious"))
help = 'Index format: n (unpadded)'
# XXX move to keenspot?
class MysticRevolution(_BasicScraper):
latestUrl = 'http://mysticrevolution.keenspot.com/'
stripUrl = latestUrl + '?cid=%s'
imageSearch = compile(tagre("img", "src", r'(http://cdn\.mysticrevolution\.keenspot\.com/comics/[^"]+)'))
prevSearch = compile(tagre("link", "rel", r'(\?cid=\d+)', before="prev"))
help = 'Index format: n (unpadded)'

View file

@ -4,14 +4,14 @@
from re import compile
from ..scraper import _BasicScraper
from ..helpers import indirectStarter, _PHPScraper
from ..helpers import indirectStarter, bounceStarter
from ..util import tagre
class NamirDeiter(_BasicScraper):
latestUrl = 'http://www.namirdeiter.com/'
stripUrl = latestUrl + 'comics/index.php?date=%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.namirdeiter\.com/comics/\d\.jpg)', quote=""))
imageSearch = compile(tagre("img", "src", r"'?(http://www\.namirdeiter\.com/comics/\d+\.jpg)'?", quote=""))
prevSearch = compile(tagre("a", "href", r'(http://www\.namirdeiter\.com/comics/index\.php\?date=\d+)', quote="'")+"Previous")
help = 'Index format: yyyymmdd'
@ -63,17 +63,19 @@ class Nukees(_BasicScraper):
help = 'Index format: yyyymmdd.html'
class NekoTheKitty(_PHPScraper):
basePath = 'http://www.nekothekitty.net/cusp/'
latestUrl = basePath
prevSearch = compile(tagre("a", "href", r'(http://www.nekothekitty.net/comics/[^"]+)') +
class NekoTheKitty(_BasicScraper):
basePath = 'http://www.nekothekitty.net/'
stripUrl = basePath + 'comics/%s'
starter = bounceStarter(basePath, compile(tagre("a", "href", r'(http://www\.nekothekitty\.net/comics/[^"]+)') +
tagre("img", "src", r'http://www\.nekothekitty\.net/files/smallnext.png')))
imageSearch = compile(tagre("img", "src", r'(http://(?:img\d+|www)\.smackjeeves\.com/images/uploaded/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.nekothekitty\.net/comics/[^"]+)') +
tagre("img", "src", r'http://www\.nekothekitty\.net/files/smallprev.png'))
help = 'Index format: n/n-name'
class NichtLustig(_BasicScraper):
stripUrl = 'http://www.nichtlustig.de/toondb/%s.html'
stripUrl = 'http://static.nichtlustig.de/toondb/%s.html'
imageSearch = compile('background-image:url\((http://static\.nichtlustig\.de/comics/full/\d+\.jpg)')
prevSearch = compile(tagre("a", "href", r'(http://static\.nichtlustig\.de/toondb/\d+\.html)'))
help = 'Index format: yymmdd'
@ -101,6 +103,7 @@ class NekkoAndJoruba(_BasicScraper):
class NobodyScores(_BasicScraper):
latestUrl = 'http://nobodyscores.loosenutstudio.com/'
stripUrl = latestUrl + 'index.php?id=%s'
imageSearch = compile(r'><img src="(http://nobodyscores\.loosenutstudio\.com/comix/.+?)"')
imageSearch = compile(tagre("img", "src", r'(http://nobodyscores\.loosenutstudio\.com/comix/[^"]+)'))
multipleImagesPerStrip = True
prevSearch = compile(r'<a href="(http://nobodyscores\.loosenutstudio\.com/index.php.+?)">the one before </a>')
help = 'Index format: nnn'

View file

@ -28,7 +28,7 @@ class OddFish(_BasicScraper):
class OnTheEdge(_BasicScraper):
latestUrl = 'http://ontheedgecomics.com/'
stripUrl = 'http://ontheedgecomics.com/comic/ote%s'
stripUrl = 'http://ontheedgecomics.com/comic/%s'
imageSearch = compile(r'<img src="(http://ontheedgecomics.com/comics/.+?)"')
prevSearch = compile(r'<a href="([^"]+)" rel="prev">')
help = 'Index format: nnn (unpadded)'

View file

@ -12,11 +12,10 @@ class PartiallyClips(_BasicScraper):
latestUrl = 'http://partiallyclips.com/'
stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(http://partiallyclips\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://partiallyclips\.com/[^"]+)', before="prev"))
prevSearch = compile(tagre("a", "href", r'(http://partiallyclips\.com/[^"]+)', after="prev"))
help = 'Index format: yyyy/mm/dd/stripname'
class PastelDefender(_BasicScraper):
latestUrl = 'http://www.pasteldefender.com/coverbackcover.html'
stripUrl = 'http://www.pasteldefender.com/%s.html'
@ -25,7 +24,6 @@ class PastelDefender(_BasicScraper):
help = 'Index format: nnn'
class PebbleVersion(_BasicScraper):
latestUrl = 'http://www.pebbleversion.com/'
stripUrl = latestUrl + 'Archives/Strip%s.html'
@ -37,7 +35,7 @@ class PebbleVersion(_BasicScraper):
class PennyAndAggie(_BasicScraper):
baseUrl = 'http://www.pennyandaggie.com/'
stripUrl = baseUrl + 'index.php?p=%s'
imageSearch = compile(tagre("a", "href", r'(http://www\.pennyandaggie\.com/comics/[^"]+)'))
imageSearch = compile(tagre("img", "src", r'(http://www\.pennyandaggie\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r"(index\.php\?p=\d+)", quote="'") +
tagre("img", "src", r'http://pennyandaggie\.com/images/previous_day\.gif', quote=""))
starter = indirectStarter(baseUrl, prevSearch)
@ -47,20 +45,19 @@ class PennyAndAggie(_BasicScraper):
class PennyArcade(_BasicScraper):
baseUrl = 'http://penny-arcade.com/comic/'
starter = bounceStarter(baseUrl,
compile(tagre("a", "href", r'(http://penny-arcade\.com/comic/[^"]+)', before="bntNext"))
compile(tagre("a", "href", r'(http://penny-arcade\.com/comic/[^"]+)', before="btnNext"))
)
stripUrl = baseUrl + '%s/'
stripUrl = baseUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://art\.penny-arcade\.com/photos/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://penny-arcade\.com/comic/[^"]+)', before="bntPrev"))
prevSearch = compile(tagre("a", "href", r'(http://penny-arcade\.com/comic/[^"]+)', before="btnPrev"))
help = 'Index format: yyyy/mm/dd'
@classmethod
def namer(cls, imageUrl, pageUrl):
yyyy, mm, dd = pageUrl.split('/')[-4:-1]
dummy, yyyy, mm, dd = pageUrl.rsplit('/', 3)
return '%04d%02d%02d' % (int(yyyy), int(mm), int(dd))
class PeppermintSaga(_BasicScraper):
latestUrl = 'http://www.pepsaga.com/'
stripUrl = latestUrl + '?p=%s'
@ -101,7 +98,7 @@ class Precocious(_BasicScraper):
class PvPonline(_BasicScraper):
latestUrl = 'http://pvponline.com/comic'
stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://newcdn\.pvponline\.com/img/comic/pvp\d+\.jpg)'))
imageSearch = compile(tagre("img", "src", r'(http://newcdn\.pvponline\.com/img/comic/pvp[^"]+\.jpg)'))
prevSearch = compile(tagre("a", "href", r'(http://pvponline\.com/comic/[^"]+)', after="Previous"))
help = 'Index format: yyyy/mm/dd/stripname'
@ -135,7 +132,7 @@ evilish = pensAndTales('Evilish', 'http://evilish.pensandtales.com/')
class ProperBarn(_BasicScraper):
latestUrl = 'http://www.nitrocosm.com/go/gag/'
stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(http://content\.nitrocosm\.com/gag/\d+.png)'))
imageSearch = compile(tagre("img", "src", r'(http://content\.nitrocosm\.com/gag/\d+\.[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.nitrocosm\.com/go/gag/\d+/)', after="nav_btn_previous"))
help = 'Index format: nnn'

View file

@ -19,7 +19,7 @@ class RadioactivePanda(_BasicScraper):
# XXX add other comics at http://petitesymphony.com/comics/
class Rascals(_BasicScraper):
latestUrl = 'http://rascals.petitesymphony.com/'
stripUrl = latestUrl + '/comic/rascals-pg-%s/'
stripUrl = latestUrl + 'comic/rascals-pg-%s/'
imageSearch = compile(tagre("img", "src", r'(http://rascals\.petitesymphony\.com/files/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://rascals\.petitesymphony\.com/comic/[^"]+)', after="Previous"))
help = 'Index format: num'
@ -36,7 +36,7 @@ class RealLife(_BasicScraper):
class RedString(_BasicScraper):
latestUrl = 'http://www.redstring.strawberrycomics.com/'
stripUrl = latestUrl + 'index.php?id=%s'
imageSearch = compile(tagre("img", "src", r'("comics/[^"]+)'))
imageSearch = compile(tagre("img", "src", r'(comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/index\.php\?id=\d+)', after="prev"))
help = 'Index format: nnn'

View file

@ -10,7 +10,7 @@ from ..util import tagre
class SailorsunOrg(_BasicScraper):
latestUrl = 'http://www.sailorsun.org/'
latestUrl = 'http://sailorsun.org/'
stripUrl = latestUrl + '?p=%s'
imageSearch = compile(tagre("img", "src", r'(http://sailorsun\.org/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://sailorsun\.org/\?p=\d+)', after="prev"))
@ -28,7 +28,7 @@ class SamAndFuzzy(_BasicScraper):
class SarahZero(_BasicScraper):
latestUrl = 'http://www.sarahzero.com/'
stripUrl = latestUrl + 'sz_%s.html'
imageSearch = compile(tagre("img", "src", r'(z_spreads/sz_[^"]+)'))
imageSearch = compile(tagre("img", "src", r'(z_(?:spreads|decoy)/sz_[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(sz_\d+\.html)') + tagre("img", "src", r'z_site/sz_05_nav\.gif'))
help = 'Index format: nnnn'
@ -45,7 +45,8 @@ class SchlockMercenary(_BasicScraper):
latestUrl = 'http://www.schlockmercenary.com/'
stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://static\.schlockmercenary\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/d+)', after="nav-previous"))
multipleImagesPerStrip = True
prevSearch = compile(tagre("a", "href", r'(/\d+-\d+-\d+)', quote="'", after="nav-previous"))
help = 'Index format: yyyy-mm-dd'
@ -102,7 +103,7 @@ class SluggyFreelance(_BasicScraper):
class SodiumEyes(_BasicScraper):
latestUrl = 'http://sodiumeyes.com/'
stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(http://sodiumeyes\.com/comic/[^"]+)'))
imageSearch = compile(tagre("img", "src", r'(http://sodiumeyes\.com/comic/[^ ]+)', quote=""))
prevSearch = compile(tagre("a", "href", r'(http://sodiumeyes\.com/[^"]+)', after="prev"))
help = 'Index format: yyyy/mm/dd/stripname'
@ -110,9 +111,9 @@ class SodiumEyes(_BasicScraper):
class SpareParts(_BasicScraper):
baseUrl = 'http://www.sparepartscomics.com/'
latestUrl = baseUrl + 'comics/?date=20080328'
stripUrl = baseUrl + 'comics/?date=s%'
imageSearch = compile(tagre("img", "src", r'http://www\.sparepartscomics\.com/comics/[^"]+'))
prevSearch = compile(tagre("a", "href", r'(index\.php\?date=\d+)') + "Previous Comic")
stripUrl = baseUrl + 'comics/index.php?date=%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.sparepartscomics\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(index\.php\?date=\d+)', quote="'") + "Previous Comic")
help = 'Index format: yyyymmdd'
@ -127,7 +128,7 @@ class Stubble(_BasicScraper):
class StrawberryDeathCake(_BasicScraper):
latestUrl = 'http://strawberrydeathcake.com/'
stripUrl = latestUrl + 'archive/%s/'
imageSearch = compile(tagre("img", "src", r'http://strawberrydeathcake\.com/wp-content/webcomic/[^"]+'))
imageSearch = compile(tagre("img", "src", r'(http://strawberrydeathcake\.com/wp-content/webcomic/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://strawberrydeathcake\.com/archive/[^"]+)', after="previous"))
help = 'Index format: stripname'
@ -144,7 +145,8 @@ class SomethingPositive(_BasicScraper):
latestUrl = 'http://www.somethingpositive.net/'
stripUrl = latestUrl + 'sp%s.shtml'
imageSearch = compile(tagre("img", "src", r'(sp\d+\.png)'))
prevSearch = compile(tagre("a", "href", r'(sp\d+\.shtml)') + "Previous")
prevSearch = compile(tagre("a", "href", r'(sp\d+\.shtml)') +
"(?:" + tagre("img", "src", r'images/previous\.gif') + "|Previous)")
help = 'Index format: mmddyyyy'
@classmethod
@ -152,7 +154,6 @@ class SomethingPositive(_BasicScraper):
return pageUrl.split('/')[-1].split('.')[0]
class SexyLosers(_BasicScraper):
stripUrl = 'http://www.sexylosers.com/%s.html'
imageSearch = compile(r'<img src\s*=\s*"\s*(comics/[\w\.]+?)"', IGNORECASE)
@ -172,7 +173,7 @@ class SexyLosers(_BasicScraper):
class StarCrossdDestiny(_BasicScraper):
latestUrl = 'http://www.starcrossd.net/comic.html'
stripUrl = 'http://www.starcrossd.net/archives/%s.html'
imageSearch = compile(r'<img src="(http://www\.starcrossd\.net/(?:ch1|strips|book2)/[^"]+)">')
imageSearch = compile(tagre("img", "src", r'(http://www\.starcrossd\.net/(?:ch1|strips|book2)/[^"]+)'))
prevSearch = compile(r'<a href="(http://www\.starcrossd\.net/(?:ch1/)?archives/\d+\.html)"[^>]*"[^"]*"[^>]*>prev', IGNORECASE)
help = 'Index format: nnnnnnnn'
@ -212,11 +213,3 @@ class SMBC(_BasicScraper):
prevSearch = compile(r'131,13,216,84"\n\s+href="(.+?)#comic"\n>', MULTILINE)
help = 'Index format: nnnn'
class SomethingLikeLife(_BasicScraper):
latestUrl = 'http://www.pulledpunches.com/'
stripUrl = latestUrl + '?p=%s'
imageSearch = compile(r'<img src="(http://www.pulledpunches.com/comics/[^"]*)"')
prevSearch = compile(r'</a> <a href="(http://www.pulledpunches.com/\?p=[^"]*)"><img src="back1.gif"')
help = 'Index format: nn'

View file

@ -10,7 +10,7 @@ from ..util import tagre
class TheNoob(_BasicScraper):
latestUrl = 'http://www.thenoobcomic.com/index.php'
stripUrl = latestUrl + '?pos=%'
stripUrl = latestUrl + '?pos=%s'
imageSearch = compile(tagre("img", "src", r'(/headquarters/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(\?pos=\d+)', before="comic_nav_previous_button"))
help = 'Index format: nnnn'
@ -19,7 +19,7 @@ class TheNoob(_BasicScraper):
class TheOrderOfTheStick(_BasicScraper):
latestUrl = 'http://www.giantitp.com/comics/oots0863.html'
stripUrl = latestUrl + 'comics/oots%s.html'
stripUrl = 'http://www.giantitp.com/comics/oots%s.html'
imageSearch = compile(r'<IMG src="(/comics/images/.+?)">')
prevSearch = compile(r'<A href="(/comics/oots\d{4}\.html)"><IMG src="/Images/redesign/ComicNav_Back.gif"')
help = 'Index format: n (unpadded)'
@ -31,7 +31,8 @@ class TheParkingLotIsFull(_BasicScraper):
latestUrl = 'http://plif.courageunfettered.com/archive/arch2002.htm'
stripUrl = 'http://plif.courageunfettered.com/archive/arch%s.htm'
imageSearch = compile(r'<td align="center"><A TARGET=_parent HREF="(wc\d+\..+?)">')
prevSearch = compile(r'-\s*\n\s*<A HREF="(arch\d{4}\.htm)">\d{4}</A>')
multipleImagesPerStrip = True
prevSearch = compile(r'\d{4} -\s+<A HREF="(arch\d{4}\.htm)">\d{4}')
help = 'Index format: nnn'
@ -40,7 +41,7 @@ class TheWotch(_BasicScraper):
latestUrl = 'http://www.thewotch.com/'
stripUrl = latestUrl + '?date=%s'
imageSearch = compile(r"<img.+?src='(comics/.+?)'")
prevSearch = compile(r"<link rel='Previous' href='(\?date=\d+-\d+-\d+)'")
prevSearch = compile(r"<link rel='Previous' href='(/\?date=\d+-\d+-\d+)'")
help = 'Index format: yyyy-mm-dd'

View file

@ -6,12 +6,12 @@ from re import compile
from ..scraper import _BasicScraper
from ..helpers import bounceStarter, indirectStarter
from ..util import getQueryParams
from ..util import getQueryParams, tagre
class Undertow(_BasicScraper):
stripUrl = 'http://undertow.dreamshards.org/%s'
imageSearch = compile(r'<img src="(.+?)"')
imageSearch = compile(tagre("img", "src", r'([^"]+\.jpg)'))
prevSearch = compile(r'href="(.+?)".+?teynpoint')
help = 'Index format: good luck !'
starter = indirectStarter('http://undertow.dreamshards.org/',

View file

@ -36,6 +36,7 @@ class WhyTheLongFace(_BasicScraper):
latestUrl = 'http://www.absurdnotions.org/wtlf200709.html'
stripUrl = 'http://www.absurdnotions.org/wtlf%s.html'
imageSearch = compile(r'<img src="(http://www.absurdnotions.org/wtlf.+?|lf\d+.\w{1,4})"', IGNORECASE)
multipleImagesPerStrip = True
prevSearch = compile(r'HREF="(.+?)"><IMG SRC="nprev.gif" ')
help = 'Index format: yyyymm'
@ -66,7 +67,7 @@ class WorldOfWarcraftEh(_BasicScraper):
class Wulffmorgenthaler(_BasicScraper):
latestUrl = 'http://wumocomicstrip.com/'
stripUrl = latestUrl + '%s/'
imageSearch = compile(tagre("img", "src", r'(/img/strip/thumb/[^"]+)'))
imageSearch = compile(tagre("img", "src", r'(/img/strip/[^/"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"]+)') + "<span>Previous")
help = 'Index format: yyyy/mm/dd'

View file

@ -2,14 +2,15 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from re import compile, IGNORECASE
from re import compile
from ..util import tagre
from ..scraper import make_scraper
from ..helpers import bounceStarter
_imageSearch = compile(r'SRC="(http://www\.wlpcomics\.com/adult/.+?|http://www\.wlpcomics\.com/general/.+?)"', IGNORECASE)
_prevSearch = compile(r'</a> <A HREF="(\w+.html)">Previous Page</a>', IGNORECASE)
_nextSearch = compile(r'</a> <A HREF="(\w+.html)">Next Page</a>', IGNORECASE)
_imageSearch = compile(tagre("img", "src", r'(http://www\.wlpcomics\.com/(?:adult|general)/[^"]+)'))
_prevSearch = compile(tagre("a", "href", r'(\w+.html)') + 'Previous')
_nextSearch = compile(tagre("a", "href", r'(\w+.html)') + 'Next')
def add(name, path):
@ -35,4 +36,3 @@ add('ChichiChan', 'adult/chichi/')
add('ChocolateMilkMaid', 'adult/cm/')
add('MaidAttack', 'general/maidattack/')
add('ShadowChasers', 'general/shadowchasers/')
add('Stellar', 'adult/stellar/')

View file

@ -19,8 +19,8 @@ class YAFGC(_BasicScraper):
class YouSayItFirst(_BasicScraper):
latestUrl = 'http://www.yousayitfirst.com/'
stripUrl = latestUrl + 'comics/index.php?date=%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.yousayitfirst\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.yousayitfirst\.com/comics/index\.php\?date=\d+)') + "Previous")
imageSearch = compile(tagre("img", "src", r"(http://www\.yousayitfirst\.com/comics/[^>']+)", quote="'?"))
prevSearch = compile(tagre("a", "href", r'(http://www\.yousayitfirst\.com/comics/index\.php\?date=\d+)', quote="'") + "Previous")
help = 'Index format: yyyymmdd'

View file

@ -17,6 +17,11 @@ class Zapiro(_BasicScraper):
prevSearch = compile(tagre("a", "href", r'(http://mg\.co\.za/cartoon/[^"]+)')+"Older")
help = 'Index format: yyyy-mm-dd-stripname'
# Derive the image name from the image URL alone; pageUrl is accepted but not used.
@classmethod
def namer(cls, imageUrl, pageUrl):
# Use the third-from-last '/'-separated segment of the image URL as the name.
name = imageUrl.split('/')[-3]
return name
class ZombieHunters(_BasicScraper):
latestUrl = 'http://www.thezombiehunters.com/'

View file

@ -78,6 +78,7 @@ class _BasicScraper(object):
while url:
imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
prevUrl = self.prevUrlModifier(prevUrl)
out.write("Matched previous URL %s" % prevUrl, 2)
seen_urls.add(url)
yield self.getComicStrip(url, imageUrls)
# avoid recursive URL loops

View file

@ -163,7 +163,7 @@ def normaliseURL(url):
pu = list(urlparse.urlparse(url))
segments = pu[2].split('/')
while segments and segments[0] == '':
while segments and segments[0] in ('', '..'):
del segments[0]
pu[2] = '/' + '/'.join(segments).replace(' ', '%20')
# remove leading '&' from query

View file

@ -16,14 +16,137 @@ json_file = __file__.replace(".py", ".json")
# names of comics to exclude
exclude_comics = [
"Twonks_and_Plonkers", # broken images, no real content
"U_Chuu_No_Hoshi_Hotoshi_Tsuko", # broken images
"Red_Dog_Venue", # start page is broken
"Monster_Lover", # start page is broken
"Legacy_of_Blaze", # broken images
"Dead_Strangers", # broken images
"Crack", # broken images
"Iron_Wolf", # broken images
"A_Call_to_Destiny__NC_17", # start page requires login
"A_Call_to_Destiny_Reloaded", # start page requires login
"A_Day_in_the_Life_for_Erik", # broken images
"A_Fairly_Twisted_Reality", # start page requires login
"Al_and_Scout", # broken images
"ANGELOU_____Las_aventuras_de_Nikole", # broken images
"Apartment_408_Full_Size", # broken images
"Apple_Valley", # broken images
"Apt_408_Minis", # broken images
"atxs", # broken images
"A_Word_Of_Wisdom", # broken images
"Brathalla", # broken images
"Binary_Souls_Other_Dimensions", # broken images
"BK_Shattered_Hate", # broken images
"Chomp", # broken images
"Chu_and_Kenny", # broken images
"Coga_Suro_2", # broken images
"Creepy_Girl_and_Her_Zombie_Dog", # broken images
"CuoreVoodoo", # broken images
"dairyaire", # broken images
"DIS", # broken images
"Dot_TXT", # broken images
"Dreadnought_Invasion_Six", # broken images
"Emerald_Winter", # broken images
"Enter_the_Duck_2", # broken images
"ffff", # broken images
"Function_Over_Fashion", # broken images
"Funday_Morning", # broken images
"greys_journey", # broken images
"Head_over_Heart", # broken images
"Hurrocks_Fardel", # broken images
"Bhaddland", # start page requires login
"Bouncing_Orbs_of_Beauty", # start page requires login
"Busty_Solar", # start page requires login
"Illusional_Beauty", # broken images
"Indigo_Bunting__Vampire", # start page requires login
"Irrumator", # start page requires login
"Its_A_Boy_Thing", # start page requires login
"Kokuahiru_comics", # start page requires login
"Inside_OuT", # broken images
"Journey_to_Raifina", # broken images
"KALA_dan", # broken images
"Live_to_tell", # start page requires login
"Locoma", # broken images
"London_Underworld", # broken images
"Louder_Than_Bombs", # broken images
"Lucky_Dawg", # broken images
"Mario_in_Johto", # broken images
"Master", # start page requires login
"Mastermind_BTRN", # broken images
"MAYA_____The_legend_of_Wolf", # broken images
"Megaman_Zero", # broken images
"Monster_Lover_Destinys_Path", # start page requires login
"M_Organ_Art", # start page requires login
"Morning_Squirtz", # start page requires login
"MOSAIC", # broken images
"My_Angel_and_My_Devil", # broken images
"Nemution_Jewel", # start page requires login
"Nemution_Redux", # start page requires login
"New_Pages", # broken images
"Ninja_Shizatch", # broken images
"Normalcy_is_for_Wimps", # broken images
"MIKYAGU", # broken images
"One_Third_Of_Your_Life_Is_Spent_Sleeping_One_Third_Of_Your_Life_Is_Spent_Working_And_Half_Of_One_Third_Is_Spent_Waiting_The_Question_Is_It_Really_Your_Life", # broken images
"OTENBA_Files", # start page requires login
"Panacea", # start page requires login
"Parker_Lot", # broken images
"Peter_And_The_Wolf", # start page requires login
"Perspectives", # broken images
"Pokemon_Sinnoh_Surfer", # broken images
"Pokemon_World_Trainers", # broken images
"Potpourri_of_Lascivious_Whimsy", # start page requires login
"Pr0nCrest", # start page requires login
"punished_girls", # start page requires login
"Powerjeff", # broken images
"Comicarotica", # start page requires login
"Dark_Sisters", # start page requires login
"Death_P0rn", # start page requires login
"Dreams_in_Synergy", # broken images
"GNight_Shade", # start page requires login
"GRIND", # start page requires login
"HUSS", # start page requires login
"Red_Dog_Venue", # start page is broken
"rubber_girls", # start page requires login
"Robomeks", # broken images
"Robot_Friday", # broken images
"SFA", # start page requires login
"Shadow_Root", # start page requires login
"Shiro_Karasu", # start page requires login
"Shelter_of_Wings", # broken images
"Some_Notes", # broken images
"Sonic_Advanced_Online", # broken images
"Sonic_and_tails_corner", # broken images
"Sonic_Unreal", # broken images
"Tales_of_Schlock", # start page requires login
"Splices_of_Life", # broken images
"STARSEARCHERS", # broken images
"Ted_The_Terrible_Superhero", # broken images
"Terra_online_comic", # broken images
"The_Auragon_Base", # broken images
"The_Bend", # broken images
"The_Chronicles_of_Drew", # broken images
"The_Devils_Horn", # broken images
"The_Dragon_and_the_Lemur", # start page requires login
"The_Fighting_Stranger", # broken images
"The_Mighty_Omega", # broken images
"The_Misadventures_of_Everyone", # start page requires login
"The_NEW_Life_Of_TimmY", # broken images
"The_SSA", # broken images
"Tony_The_Hedgehog", # broken images
"Trapped_in_a_Comic", # start page requires login
"Unsound_of_Mind", # broken images
"Vampire_Chronicles__Dark_Lust", # start page requires login
"WarMage", # start page requires login
"Watashi_No_Ame", # broken images
"Weave", # broken images
"Weirdlings", # template error
"Welcome_To_Border_City", # broken images
"what_comes_first", # start page requires login
"Within_Shadows", # broken images
"Xolta", # start page requires login
"XTIN__The_Dragons_Dream_World", # start page requires login
"X_UP", # start page requires login
"Zandars_Saga", # start page requires login
"Twonks_and_Plonkers", # broken images, no real content
"U_Chuu_No_Hoshi_Hotoshi_Tsuko", # broken images
]

View file

@ -21,6 +21,22 @@ url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r
# names of comics to exclude
exclude_comics = [
"FrikkFrakkAndFrank", # too few comics
"Apocalypseharry", # too few comics
"BatkidandBatrat", # too few comics
"BETWEENTHELINES", # comic unavailable
"Bonner", # missing page
"Buster", # comic unavailable
"DALTONDOG", # comic unavailable
"DellAndSteve", # too few comics
"Dilbert", # redirect
"InkeeDoodles", # comic unavailable
"MaggiesComics", # too few comics
"OfMiceandMud", # too few comics
"OysterWar", # too few comics
"PIGTIMES", # comic unavailable
"PS", # comic unavailable
"SherpaAid", # comic unavailable
"SparComics", # comic unavailable
]

View file

@ -21,27 +21,59 @@ num_matcher = re.compile(r'Number of Days: (\d+)')
# names of comics to exclude
exclude_comics = [
"10", # page is gone
"54sinRed", # page is 403 forbidden
"6D4", # redirected to another page
"AaaSoCAwesomenessandaSliceofCheese", # broken images
"AcrossthePond", # page moved
"ACDeceptibotscomic", # no images
"AdamandSei", # page has 403 forbidden
"AdamsRoadGang", # page is gone
"ADVENTURERS", # page is gone
"AiYaiYai", # page moved
"AlltheCommies", # missing images
"AltaModaMetro", # page redirected
"AltarGirl", # page redirected
"Amerika", # no images
"Angels", # page has 403 forbidden
"AngryDMonkey", # page redirected
"Angst", # page redirected
"Animenifesto", # too few images
"Anna", # no images
"Arcana", # archive broken
"Area15", # no images
"BaidheTu", # no images
"BasilFlint", # page redirected
"beerkada", # no images
"BelovedLeader", # broken images
"BigMouthComics", # page does not follow standard layout
"", # page is gone
"", # page is gone
"", # page is gone
"BlueZombie", # broken page
"BoomerExpress", # redirection to another page
"DungeonDamage", # page does not follow standard layout
"EarthRiser", # redirects to a new page
"FaultyLogic", # page does not follow standard layout
"GoForIt", # page is gone
"JuvenileDiversion", # page moved
"JustWeird", # page has 403 forbidden
"Michikomonogatari", # page does not follow standard layout
"MobileMadness", # page does not follow standard layout
"KnightsOfTheNexus", # page does not follow standard layout
"RogerAndDominic", # page does not follow standard layout
"TheAvatar", # page does not follow standard layout
"Michikomonogatari", # page does not follow standard layout
"DungeonDamage", # page does not follow standard layout
"SaveMeGebus", # page does not follow standard layout
"BlueZombie", # broken page
"BoomerExpress", # redirection to another page
"FaultyLogic", # page does not follow standard layout
"EarthRiser", # redirects to a new page
"GoForIt", # page is gone
"ACDeceptibotscomic", # no images
"TheAvatar", # page does not follow standard layout
]
# links to last valid strips
url_overrides = {
# link to last valid strip
"BallofYarn": "http://ballofyarn.comicgenesis.com/d/20020624.html",
"AmazonSpaceRangers": "http://amazons.comicgenesis.com/d/20051015.html",
"ArroganceinSimplicity": "http://arrogance.comicgenesis.com/d/20030217.html",
"ATasteofEvil": "http://atasteofevil.comicgenesis.com/d/20050314.html",
"": "",
"": "",
}
def handle_url(url, res):

View file

@ -33,30 +33,31 @@ class _ComicTester(TestCase):
# at least 5 strips from the start, and find strip images
# on at least 4 pages.
scraperobj = self.scraperclass()
num = empty = 0
num = 0
max_strips = 5
for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
images = 0
for image in strip.getImages():
images += 1
self.save(image)
if images == 0:
empty += 1
self.check(images > 0, 'failed to find images at %s' % strip.stripUrl)
if not self.scraperclass.multipleImagesPerStrip:
self.check(images == 1, 'found %d instead of 1 image at %s' % (images, strip.stripUrl))
if num > 0:
self.check_stripurl(strip)
num += 1
if self.scraperclass.prevSearch:
self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num)
# check that at exactly or for multiple pages at least 5 images are saved
self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern at %s.' % (num, strip.stripUrl))
# Check that exactly 5 images are saved, or at least 5 when a strip may have multiple images.
# This differs from the per-strip image count check above because it counts saved files,
# i.e. it detects duplicate filenames.
saved_images = self.get_saved_images()
num_images = len(saved_images)
attrs = (num_images, saved_images, max_strips, self.tmpdir)
if self.scraperclass.multipleImagesPerStrip:
self.check(num_images >= max_strips,
'saved %d %s instead of at least %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
self.check(num_images >= max_strips, 'saved %d %s instead of at least %d images in %s' % attrs)
else:
self.check(num_images == max_strips,
'saved %d %s instead of %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty)
self.check(num_images == max_strips, 'saved %d %s instead of %d images in %s' % attrs)
def check_stripurl(self, strip):
if not self.scraperclass.stripUrl: