Read starter parameters from class.

This allows starters to be specified in a more declarative and dynamic way.
This commit is contained in:
Tobias Gruetzmacher 2016-04-12 23:11:39 +02:00
parent b865a171f9
commit 42e43fa4e6
23 changed files with 186 additions and 140 deletions

View file

@ -1,8 +1,13 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from .util import getQueryParams from .util import getQueryParams
def queryNamer(paramName, usePageUrl=False): def queryNamer(paramName, usePageUrl=False):
"""Get name from URL query part.""" """Get name from URL query part."""
@classmethod @classmethod
@ -25,23 +30,32 @@ def regexNamer(regex, usePageUrl=False):
return _namer return _namer
def bounceStarter(url, nextSearch): def bounceStarter():
"""Get start URL by "bouncing" back and forth one time.""" """Get start URL by "bouncing" back and forth one time.
This needs the url and nextSearch properties to be defined on the class.
"""
@classmethod @classmethod
def _starter(cls): def _starter(cls):
"""Get bounced start URL.""" """Get bounced start URL."""
data = cls.getPage(url) data = cls.getPage(cls.url)
url1 = cls.fetchUrl(url, data, cls.prevSearch) url1 = cls.fetchUrl(cls.url, data, cls.prevSearch)
data = cls.getPage(url1) data = cls.getPage(url1)
return cls.fetchUrl(url1, data, nextSearch) return cls.fetchUrl(url1, data, cls.nextSearch)
return _starter return _starter
def indirectStarter(url, latestSearch): def indirectStarter():
"""Get start URL by indirection.""" """Get start URL by indirection.
This is useful for comics where the latest comic can't be reached at a
stable URL. If the class has an attribute 'startUrl', this page is fetched
first, otherwise the page at 'url' is fetched. After that, the attribute
'latestSearch' is used on the page content to find the latest strip."""
@classmethod @classmethod
def _starter(cls): def _starter(cls):
"""Get indirect start URL.""" """Get indirect start URL."""
url = cls.startUrl if hasattr(cls, "startUrl") else cls.url
data = cls.getPage(url) data = cls.getPage(url)
return cls.fetchUrl(url, data, latestSearch) return cls.fetchUrl(url, data, cls.latestSearch)
return _starter return _starter

View file

@ -16,8 +16,7 @@ from .common import _WordPressScraper, xpath_class, WP_LATEST_SEARCH
class AbstruseGoose(_BasicScraper): class AbstruseGoose(_BasicScraper):
url = 'http://abstrusegoose.com/' url = 'http://abstrusegoose.com/'
rurl = escape(url) rurl = escape(url)
starter = bounceStarter( starter = bounceStarter()
url, compile(tagre('a', 'href', r'(%s\d+)' % rurl) + "Next »"))
stripUrl = url + '%s' stripUrl = url + '%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre('img', 'src', imageSearch = compile(tagre('img', 'src',
@ -81,7 +80,6 @@ class AfterStrife(_WordPressScraper):
class AGirlAndHerFed(_BasicScraper): class AGirlAndHerFed(_BasicScraper):
url = 'http://www.agirlandherfed.com/' url = 'http://www.agirlandherfed.com/'
starter = bounceStarter(url, compile(r'<a href="([^"]+)">[^>]+Back'))
stripUrl = url + '1.%s.html' stripUrl = url + '1.%s.html'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(img/strip/[^"]+\.jpg)')) imageSearch = compile(tagre("img", "src", r'(img/strip/[^"]+\.jpg)'))
@ -114,7 +112,6 @@ class ALessonIsLearned(_BasicScraper):
url = 'http://www.alessonislearned.com/' url = 'http://www.alessonislearned.com/'
prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=\d+)", prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=\d+)",
quote="'") + r"[^>]+previous") quote="'") + r"[^>]+previous")
starter = indirectStarter(url, prevSearch)
stripUrl = url + 'index.php?comic=%s' stripUrl = url + 'index.php?comic=%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r"(cmx/lesson\d+\.[a-z]+)")) imageSearch = compile(tagre("img", "src", r"(cmx/lesson\d+\.[a-z]+)"))
@ -124,8 +121,8 @@ class ALessonIsLearned(_BasicScraper):
class Alice(_WordPressScraper): class Alice(_WordPressScraper):
url = 'http://www.alicecomics.com/' url = 'http://www.alicecomics.com/'
prevSearch = '//a[%s]' % xpath_class('navi-prev-in') prevSearch = '//a[%s]' % xpath_class('navi-prev-in')
starter = indirectStarter('http://www.alicecomics.com/', latestSearch = '//a[text()="Latest Alice!"]'
'//a[text()="Latest Alice!"]') starter = indirectStarter()
class AlienLovesPredator(_BasicScraper): class AlienLovesPredator(_BasicScraper):
@ -264,7 +261,8 @@ class ARedTailsDream(_BasicScraper):
class Ashes(_WordPressScraper): class Ashes(_WordPressScraper):
url = 'http://www.flowerlarkstudios.com/comic/prologue/10232009/' url = 'http://www.flowerlarkstudios.com/comic/prologue/10232009/'
firstStripUrl = url firstStripUrl = url
starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH) latestSearch = WP_LATEST_SEARCH
starter = indirectStarter()
class ASkeweredParadise(_BasicScraper): class ASkeweredParadise(_BasicScraper):
@ -289,12 +287,13 @@ class ASofterWorld(_ParserScraper):
class AstronomyPOTD(_ParserScraper): class AstronomyPOTD(_ParserScraper):
baseUrl = 'http://apod.nasa.gov/apod/' baseUrl = 'http://apod.nasa.gov/apod/'
url = baseUrl + 'astropix.html' url = baseUrl + 'astropix.html'
starter = bounceStarter(url, '//a[text()=">"]') starter = bounceStarter()
stripUrl = baseUrl + 'ap%s.html' stripUrl = baseUrl + 'ap%s.html'
firstStripUrl = stripUrl % '061012' firstStripUrl = stripUrl % '061012'
imageSearch = '//a/img' imageSearch = '//a/img'
multipleImagesPerStrip = True multipleImagesPerStrip = True
prevSearch = '//a[text()="<"]' prevSearch = '//a[text()="<"]'
nextSearch = '//a[text()=">"]'
help = 'Index format: yymmdd' help = 'Index format: yymmdd'
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):

View file

@ -47,8 +47,8 @@ class BalderDash(_ComicControlScraper):
class Bardsworth(_WordPressScraper): class Bardsworth(_WordPressScraper):
url = 'http://www.bardsworth.com/' url = 'http://www.bardsworth.com/'
starter = indirectStarter('http://www.bardsworth.com/', latestSearch = '//a[@rel="bookmark"]'
'//a[@rel="bookmark"]') starter = indirectStarter()
class Baroquen(_BasicScraper): class Baroquen(_BasicScraper):
@ -72,12 +72,15 @@ class Beetlebum(_BasicScraper):
rurl = escape(url) rurl = escape(url)
stripUrl = url + '%s' stripUrl = url + '%s'
firstStripUrl = stripUrl % '2006/03/10/quiz-fur-ruskiphile' firstStripUrl = stripUrl % '2006/03/10/quiz-fur-ruskiphile'
starter = indirectStarter(url, compile(tagre('a', 'href', r'(%s\d{4}/\d{2}/\d{2}/[^"]+)' % rurl, after='bookmark'))) starter = indirectStarter()
multipleImagesPerStrip = True multipleImagesPerStrip = True
imageSearch = compile(tagre('img', 'src', r'(http://blog\.beetlebum\.de/wp-content/uploads/[^"]+)')) imageSearch = compile(tagre('img', 'src', r'(http://blog\.beetlebum\.de/wp-content/uploads/[^"]+)'))
prevSearch = compile(tagre('a', 'href', prevSearch = compile(tagre('a', 'href',
r'(%s\d{4}/\d{2}/\d{2}/[^"]*)' % rurl, r'(%s\d{4}/\d{2}/\d{2}/[^"]*)' % rurl,
after='prev')) after='prev'))
latestSearch = compile(tagre('a', 'href',
r'(%s\d{4}/\d{2}/\d{2}/[^"]+)' % rurl,
after='bookmark'))
help = 'Index format: yyyy/mm/dd/striptitle' help = 'Index format: yyyy/mm/dd/striptitle'
lang = 'de' lang = 'de'
@ -223,7 +226,8 @@ class BoredAndEvil(_BasicScraper):
firstStripUrl = stripUrl % '2004-06-07' firstStripUrl = stripUrl % '2004-06-07'
imageSearch = compile(tagre("img", "src", r'(strips/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(strips/[^"]+)'))
prevSearch = compile(r'First Comic.+<a href="(.+?)".+previous-on.gif') prevSearch = compile(r'First Comic.+<a href="(.+?)".+previous-on.gif')
starter = indirectStarter(url, prevSearch) latestSearch = prevSearch
starter = indirectStarter()
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'

View file

@ -55,7 +55,7 @@ class Carciphona(_BasicScraper):
after="prevarea")) after="prevarea"))
latestSearch = compile(tagre("a", "href", latestSearch = compile(tagre("a", "href",
r'(view\.php\?page=[0-9]+[^"]*)')) r'(view\.php\?page=[0-9]+[^"]*)'))
starter = indirectStarter(url, latestSearch) starter = indirectStarter()
@classmethod @classmethod
def namer(cls, imageUrl, pageUrl): def namer(cls, imageUrl, pageUrl):
@ -275,10 +275,11 @@ class CoolCatStudio(_BasicScraper):
class CorydonCafe(_ParserScraper): class CorydonCafe(_ParserScraper):
url = 'http://corydoncafe.com/' url = 'http://corydoncafe.com/'
starter = indirectStarter(url, '//ul//a') starter = indirectStarter()
stripUrl = url + '%s.php' stripUrl = url + '%s.php'
imageSearch = "//center[2]//img" imageSearch = "//center[2]//img"
prevSearch = '//a[@title="prev"]' prevSearch = '//a[@title="prev"]'
latestSearch = '//ul//a'
help = 'Index format: yyyy/stripname' help = 'Index format: yyyy/stripname'
@classmethod @classmethod
@ -345,14 +346,15 @@ class CucumberQuest(_BasicScraper):
rurl = escape(url) rurl = escape(url)
stripUrl = url + 'cq/%s/' stripUrl = url + 'cq/%s/'
firstStripUrl = stripUrl % 'page-1' firstStripUrl = stripUrl % 'page-1'
starter = indirectStarter(url + 'recent.html', startUrl = url + 'recent.html'
compile(r'window\.location="(/cq/[^"]+/)"')) starter = indirectStarter()
imageSearch = ( imageSearch = (
compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/\d+[^"]+)' % rurl)), compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/\d+[^"]+)' % rurl)),
compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/ch\d+[^"]+)' % rurl)), compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/ch\d+[^"]+)' % rurl)),
compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/bonus[^"]+)' % rurl)), compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/bonus[^"]+)' % rurl)),
) )
prevSearch = compile(tagre("a", "href", r'(%scq/[^"]+/)' % rurl, after="previous")) prevSearch = compile(tagre("a", "href", r'(%scq/[^"]+/)' % rurl, after="previous"))
latestSearch = compile(r'window\.location="(/cq/[^"]+/)"')
help = 'Index format: stripname' help = 'Index format: stripname'
@ -377,11 +379,12 @@ class Curvy(_ParserScraper):
class CyanideAndHappiness(_BasicScraper): class CyanideAndHappiness(_BasicScraper):
url = 'http://www.explosm.net/comics/' url = 'http://www.explosm.net/comics/'
starter = bounceStarter(url, compile(tagre("a", "href", r"(/comics/\d+/)", after="next-comic"))) starter = bounceStarter()
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % '15' firstStripUrl = stripUrl % '15'
imageSearch = compile(tagre("img", "src", r'(//files.explosm.net/comics/[^"]+)', before="main-comic")) imageSearch = compile(tagre("img", "src", r'(//files.explosm.net/comics/[^"]+)', before="main-comic"))
prevSearch = compile(tagre("a", "href", r'(/comics/\d+/)', after="previous-comic")) prevSearch = compile(tagre("a", "href", r'(/comics/\d+/)', after="previous-comic"))
nextSearch = compile(tagre("a", "href", r"(/comics/\d+/)", after="next-comic"))
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):

View file

@ -13,17 +13,12 @@ from ..util import tagre
from .common import _WordPressScraper, xpath_class from .common import _WordPressScraper, xpath_class
class DailyDose(_BasicScraper): class DailyDose(_ParserScraper):
url = 'http://dailydoseofcomics.com/' url = 'http://dailydoseofcomics.com/'
starter = indirectStarter( starter = indirectStarter()
url, compile(tagre("a", "href", imageSearch = '//p/a/img'
r'(http://dailydoseofcomics\.com/[^"]+)', prevSearch = '//a[@rel="prev"]'
after="preview"))) latestSearch = '//a[@rel="bookmark"]'
stripUrl = url + '%s/'
imageSearch = compile(tagre("img", "src", r'([^"]+)',
before="align(?:none|center)"))
prevSearch = compile(tagre("a", "href", r'(http://dailydoseofcomics\.com/[^"]+)', after="prev"))
help = 'Index format: stripname'
class DamnLol(_BasicScraper): class DamnLol(_BasicScraper):
@ -31,13 +26,13 @@ class DamnLol(_BasicScraper):
rurl = escape(url) rurl = escape(url)
stripUrl = url + '%s.html' stripUrl = url + '%s.html'
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev")) prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
nextSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="next"))
imageSearch = ( imageSearch = (
compile(tagre("img", "src", r'(%si/[^"]+)' % rurl)), compile(tagre("img", "src", r'(%si/[^"]+)' % rurl)),
compile(tagre("img", "src", r'(%spics/[^"]+)' % rurl)), compile(tagre("img", "src", r'(%spics/[^"]+)' % rurl)),
) )
help = 'Index format: stripname-number' help = 'Index format: stripname-number'
starter = bounceStarter( starter = bounceStarter()
url, compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="next")))
@classmethod @classmethod
def namer(cls, imageUrl, pageUrl): def namer(cls, imageUrl, pageUrl):
@ -160,9 +155,12 @@ class Dilbert(_BasicScraper):
url = 'http://dilbert.com/' url = 'http://dilbert.com/'
stripUrl = url + '/strip/%s/' stripUrl = url + '/strip/%s/'
firstStripUrl = stripUrl % '1989-04-16' firstStripUrl = stripUrl % '1989-04-16'
starter = indirectStarter(url, compile(tagre("a", "href", r'(http://dilbert.com/strip/[0-9-]*)', after="Click to see"))) starter = indirectStarter()
prevSearch = compile(tagre("a", "href", r'(/strip/\d+-\d+-\d+)', after="Older Strip")) prevSearch = compile(tagre("a", "href", r'(/strip/\d+-\d+-\d+)', after="Older Strip"))
imageSearch = compile(tagre("img", "src", r'(http://assets.amuniversal.com/\w+)')) imageSearch = compile(tagre("img", "src", r'(http://assets.amuniversal.com/\w+)'))
latestSearch = compile(tagre("a", "href",
r'(http://dilbert.com/strip/[0-9-]*)',
after="Click to see"))
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
@classmethod @classmethod
@ -254,9 +252,10 @@ class DresdenCodak(_BasicScraper):
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl) + prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl) +
tagre("img", "src", r"%sm_prev2?\.png" % rurl, tagre("img", "src", r"%sm_prev2?\.png" % rurl,
quote="")) quote=""))
starter = indirectStarter( latestSearch = compile(tagre("div", "id", "preview") +
url, compile(tagre("div", "id", "preview") + tagre("a", "href",
tagre("a", "href", r'(%s\d+/\d+/\d+/[^"]+)' % rurl))) r'(%s\d+/\d+/\d+/[^"]+)' % rurl))
starter = indirectStarter()
class DrFun(_BasicScraper): class DrFun(_BasicScraper):

View file

@ -15,9 +15,10 @@ from .common import _WordPressScraper, WP_LATEST_SEARCH, xpath_class
class EarthsongSaga(_ParserScraper): class EarthsongSaga(_ParserScraper):
url = 'http://earthsongsaga.com/index.php' url = 'http://earthsongsaga.com/index.php'
starter = indirectStarter(url, '//div[@id="leftmenu"]/span[1]/a[1]') starter = indirectStarter()
imageSearch = '//div[@id="comic"]//img' imageSearch = '//div[@id="comic"]//img'
prevSearch = '//a[@title="Previous"]' prevSearch = '//a[@title="Previous"]'
latestSearch = '//div[@id="leftmenu"]/span[1]/a[1]'
@classmethod @classmethod
def fetchUrls(cls, url, data, urlSearch): def fetchUrls(cls, url, data, urlSearch):
@ -43,21 +44,23 @@ class EarthsongSaga(_ParserScraper):
class EasilyAmused(_WordPressScraper): class EasilyAmused(_WordPressScraper):
url = 'http://www.flowerlarkstudios.com/comic/college-daze/ea01/' url = 'http://www.flowerlarkstudios.com/comic/college-daze/ea01/'
firstStripUrl = url firstStripUrl = url
starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH) latestSearch = WP_LATEST_SEARCH
starter = indirectStarter()
class EatLiver(_BasicScraper): class EatLiver(_BasicScraper):
url = 'http://www.eatliver.com/' url = 'http://www.eatliver.com/'
rurl = escape(url) rurl = escape(url)
starter = indirectStarter(url, compile( starter = indirectStarter()
tagre("a", "href", r'(i\.php\?n=\d+)') +
tagre("img", "src", r'img/small/[^"]+') + r"</a>\s*<br"))
stripUrl = url + "i.php?n=%s" stripUrl = url + "i.php?n=%s"
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("link", "href", r'(%simg/\d+/[^"]+)' % rurl, imageSearch = compile(tagre("link", "href", r'(%simg/\d+/[^"]+)' % rurl,
before="image_src")) before="image_src"))
prevSearch = compile(tagre("a", "href", r'(i\.php\?n=\d+)') + prevSearch = compile(tagre("a", "href", r'(i\.php\?n=\d+)') +
"&#060;&#060; Previous") "&#060;&#060; Previous")
latestSearch = compile(tagre("a", "href", r'(i\.php\?n=\d+)') +
tagre("img", "src", r'img/small/[^"]+') +
r"</a>\s*<br")
class EatThatToast(_BasicScraper): class EatThatToast(_BasicScraper):
@ -181,7 +184,8 @@ class Erstwhile(_WordPressScraper):
class Eryl(_WordPressScraper): class Eryl(_WordPressScraper):
url = 'http://www.flowerlarkstudios.com/comic/prologue-migration/page-i/' url = 'http://www.flowerlarkstudios.com/comic/prologue-migration/page-i/'
firstStripUrl = url firstStripUrl = url
starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH) latestSearch = WP_LATEST_SEARCH
starter = indirectStarter()
help = 'This was known as DarkWings in previous Dosage versions' help = 'This was known as DarkWings in previous Dosage versions'

View file

@ -26,10 +26,9 @@ class FantasyRealms(_BasicScraper):
stripUrl = url + 'manga/%s.php' stripUrl = url + 'manga/%s.php'
imageSearch = compile(r'<img src="(\d{1,4}.\w{3,4})" width="540"', IGNORECASE) imageSearch = compile(r'<img src="(\d{1,4}.\w{3,4})" width="540"', IGNORECASE)
prevSearch = compile(r'<a href="(.+?)"><img src="../images/nav-back.gif"', IGNORECASE) prevSearch = compile(r'<a href="(.+?)"><img src="../images/nav-back.gif"', IGNORECASE)
latestSearch = compile(r'<a href="(manga/.+?)"><img src="preview.jpg"', IGNORECASE)
help = 'Index format: nnn' help = 'Index format: nnn'
starter = indirectStarter( starter = indirectStarter()
url,
compile(r'<a href="(manga/.+?)"><img src="preview.jpg"', IGNORECASE))
class FauxPas(_BasicScraper): class FauxPas(_BasicScraper):
@ -47,8 +46,9 @@ class FeyWinds(_BasicScraper):
stripUrl = baseUrl + 'comic/page.php?id=%s' stripUrl = baseUrl + 'comic/page.php?id=%s'
imageSearch = compile(r"(../comic/pages//.+?)'") imageSearch = compile(r"(../comic/pages//.+?)'")
prevSearch = compile(r"(page.php\?id=.+?)'.+?navprevious.png") prevSearch = compile(r"(page.php\?id=.+?)'.+?navprevious.png")
latestSearch = compile(r'(comic/page.php\?id.+?)"')
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
starter = indirectStarter(url, compile(r'(comic/page.php\?id.+?)"')) starter = indirectStarter()
class FilibusterCartoons(_BasicScraper): class FilibusterCartoons(_BasicScraper):
@ -159,9 +159,9 @@ class FredoAndPidjin(_BasicScraper):
) )
multipleImagesPerStrip = True multipleImagesPerStrip = True
prevSearch = compile(tagre('a', 'href', '([^"]+)') + "Prev</a>") prevSearch = compile(tagre('a', 'href', '([^"]+)') + "Prev</a>")
starter = indirectStarter( latestSearch = compile(tagre('a', 'href', "(" + url +
url, r'\d\d\d\d/\d\d/\d\d/[^"]+/)'))
compile(tagre('a', 'href', "(" + url + r'\d\d\d\d/\d\d/\d\d/[^"]+/)'))) starter = indirectStarter()
class Freefall(_BasicScraper): class Freefall(_BasicScraper):

View file

@ -27,15 +27,15 @@ class Garanos(_BasicScraper):
baseUrl = 'http://garanos.alexheberling.com/' baseUrl = 'http://garanos.alexheberling.com/'
rurl = escape(baseUrl) rurl = escape(baseUrl)
url = baseUrl + 'pages/page-1/' url = baseUrl + 'pages/page-1/'
starter = indirectStarter( starter = indirectStarter()
url, compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
after="nav-last")))
stripUrl = baseUrl + 'pages/page-%s' stripUrl = baseUrl + 'pages/page-%s'
imageSearch = compile( imageSearch = compile(
tagre("img", "src", tagre("img", "src",
r'(%swp-content/uploads/sites/\d+/\d+/\d+/[^"]+)' % rurl)) r'(%swp-content/uploads/sites/\d+/\d+/\d+/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl, prevSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
after="prev")) after="prev"))
latestSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
after="nav-last"))
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
@ -136,14 +136,14 @@ class GoGetARoomie(_ComicControlScraper):
class GoneWithTheBlastwave(_BasicScraper): class GoneWithTheBlastwave(_BasicScraper):
url = 'http://www.blastwave-comic.com/index.php?p=comic&nro=1' url = 'http://www.blastwave-comic.com/index.php?p=comic&nro=1'
starter = indirectStarter( starter = indirectStarter()
url, compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' +
r'<img src="images/page/default/latest'))
stripUrl = url[:-1] + '%s' stripUrl = url[:-1] + '%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(r'<img.+src=".+(/comics/.+?)"') imageSearch = compile(r'<img.+src=".+(/comics/.+?)"')
prevSearch = compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' + prevSearch = compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' +
r'<img src="images/page/default/previous') r'<img src="images/page/default/previous')
latestSearch = compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' +
r'<img src="images/page/default/latest')
help = 'Index format: n' help = 'Index format: n'
@classmethod @classmethod

View file

@ -41,15 +41,15 @@ class _HappyJar(_WordPressScraper):
class HarkAVagrant(_BasicScraper): class HarkAVagrant(_BasicScraper):
url = 'http://www.harkavagrant.com/' url = 'http://www.harkavagrant.com/'
rurl = escape(url) rurl = escape(url)
starter = bounceStarter( starter = bounceStarter()
url, compile(tagre("a", "href", r'(%sindex\.php\?id=\d+)' % rurl) +
tagre("img", "src", "buttonnext.png")))
stripUrl = url + 'index.php?id=%s' stripUrl = url + 'index.php?id=%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(%s[^"]+)' % rurl, imageSearch = compile(tagre("img", "src", r'(%s[^"]+)' % rurl,
after='BORDER')) after='BORDER'))
prevSearch = compile(tagre("a", "href", r'(%sindex\.php\?id=\d+)' % rurl) + prevSearch = compile(tagre("a", "href", r'(%sindex\.php\?id=\d+)' % rurl) +
tagre("img", "src", "buttonprevious.png")) tagre("img", "src", "buttonprevious.png"))
nextSearch = compile(tagre("a", "href", r'(%sindex\.php\?id=\d+)' % rurl) +
tagre("img", "src", "buttonnext.png"))
help = 'Index format: number' help = 'Index format: number'
@classmethod @classmethod

View file

@ -1,8 +1,12 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile, escape from re import compile, escape
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..util import tagre from ..util import tagre
from ..helpers import indirectStarter from ..helpers import indirectStarter
@ -30,9 +34,12 @@ class JerkCity(_BasicScraper):
class JimBenton(_BasicScraper): class JimBenton(_BasicScraper):
url = 'http://www.jimbenton.com/page14/page14.html' url = 'http://www.jimbenton.com/page14/page14.html'
stripUrl = 'http://www.jimbenton.com/page14/files/JimBentonComic-%s.html' stripUrl = 'http://www.jimbenton.com/page14/files/JimBentonComic-%s.html'
starter = indirectStarter(url, compile(tagre("a", "href", r'(files/JimBentonComic-[^>]+\.html)', quote=""))) starter = indirectStarter()
imageSearch = compile(tagre("img", "src", r'(JimBentonComic-[^"]+)', before="photo-frame")) imageSearch = compile(tagre("img", "src", r'(JimBentonComic-[^"]+)',
prevSearch = compile(tagre("a", "href", r'(JimBentonComic-[^>]+\.html)', quote="") + "Next") before="photo-frame"))
prevSearch = compile(tagre("a", "href", r'(JimBentonComic-[^>]+\.html)',
quote="") + "Next")
latestSearch = compile(tagre("a", "href", r'(files/JimBentonComic-[^>]+\.html)', quote=""))
help = 'Index format: stripname' help = 'Index format: stripname'
@ -58,6 +65,7 @@ class JustAnotherEscape(_BasicScraper):
rurl = escape(url) rurl = escape(url)
stripUrl = url + 'index.cgi?date=%s' stripUrl = url + 'index.cgi?date=%s'
imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl)) imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%s/index\.cgi\?date=\d+)' % rurl) prevSearch = compile(tagre("a", "href",
+ tagre("img", "alt", "Previous Comic")) r'(%s/index\.cgi\?date=\d+)' % rurl) +
tagre("img", "alt", "Previous Comic"))
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'

View file

@ -9,7 +9,6 @@ from re import compile, escape, IGNORECASE
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..util import tagre from ..util import tagre
from ..helpers import indirectStarter
from .common import _ComicControlScraper, _WordPressScraper, xpath_class from .common import _ComicControlScraper, _WordPressScraper, xpath_class
@ -81,4 +80,3 @@ class KuroShouri(_BasicScraper):
tagre("a", "href", r'(%s\?webcomic_post\=[^"]+)' % rurl, tagre("a", "href", r'(%s\?webcomic_post\=[^"]+)' % rurl,
after="previous")) after="previous"))
help = 'Index format: chapter-n-page-m' help = 'Index format: chapter-n-page-m'
starter = indirectStarter(url, prevSearch)

View file

@ -21,10 +21,10 @@ class Lackadaisy(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(http://www\.lackadaisycats\.com/comic/[^"]*)')) imageSearch = compile(tagre("img", "src", r'(http://www\.lackadaisycats\.com/comic/[^"]*)'))
prevSearch = compile(tagre("a", "href", r"(/comic\.php\?comicid=[0-9]+)") + prevSearch = compile(tagre("a", "href", r"(/comic\.php\?comicid=[0-9]+)") +
"&lt; Previous") "&lt; Previous")
nextSearch = compile(tagre("a", "href", r"(/comic.php\?comicid=[0-9]+)") +
"Next")
help = 'Index format: n' help = 'Index format: n'
starter = bounceStarter( starter = bounceStarter()
url, compile(tagre("a", "href", r"(/comic.php\?comicid=[0-9]+)") +
"Next"))
@classmethod @classmethod
def namer(cls, imageUrl, pageUrl): def namer(cls, imageUrl, pageUrl):
@ -37,7 +37,8 @@ class Lackadaisy(_BasicScraper):
class Laiyu(_WordPressScraper): class Laiyu(_WordPressScraper):
url = 'http://www.flowerlarkstudios.com/comic/preliminary-concepts/welcome/' url = 'http://www.flowerlarkstudios.com/comic/preliminary-concepts/welcome/'
firstStripUrl = url firstStripUrl = url
starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH) latestSearch = WP_LATEST_SEARCH
starter = indirectStarter()
class LasLindas(_BasicScraper): class LasLindas(_BasicScraper):
@ -64,9 +65,9 @@ class LeastICouldDo(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/\d{8,9}\.\w{1,4})' % rurl)) imageSearch = compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/\d{8,9}\.\w{1,4})' % rurl))
prevSearch = compile(tagre("a", "href", r'(%scomic/\d+/)' % rurl, prevSearch = compile(tagre("a", "href", r'(%scomic/\d+/)' % rurl,
after="Previous")) after="Previous"))
starter = indirectStarter( latestSearch = compile(tagre("a", "href", r'(%scomic/\d+/)' % rurl,
url, compile(tagre("a", "href", r'(%scomic/\d+/)' % rurl, after="feature-comic"))
after="feature-comic"))) starter = indirectStarter()
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
@ -110,12 +111,11 @@ class LoadingArtist(_ParserScraper):
class LookingForGroup(_ParserScraper): class LookingForGroup(_ParserScraper):
url = 'http://www.lfgcomic.com/' url = 'http://www.lfgcomic.com/'
rurl = escape(url)
stripUrl = url + 'page/%s/' stripUrl = url + 'page/%s/'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
css = True css = True
imageSearch = '#comic img' imageSearch = '#comic img'
prevSearch = '#comic-left > a' prevSearch = '#comic-left > a'
starter = indirectStarter(url, '#header-dropdown-comic-lfg > a:nth-of-type(2)') latestSearch = '#header-dropdown-comic-lfg > a:nth-of-type(2)'
nameSearch = compile(r'/page/([-0-9]+)/') starter = indirectStarter()
help = 'Index format: nnn' help = 'Index format: nnn'

View file

@ -102,9 +102,9 @@ class NichtLustig(_BasicScraper):
lang = 'de' lang = 'de'
imageSearch = compile('background-image:url\((http://static\.nichtlustig\.de/comics/full/\d+\.jpg)') imageSearch = compile('background-image:url\((http://static\.nichtlustig\.de/comics/full/\d+\.jpg)')
prevSearch = compile(tagre("a", "href", r'(http://static\.nichtlustig\.de/toondb/\d+\.html)')) prevSearch = compile(tagre("a", "href", r'(http://static\.nichtlustig\.de/toondb/\d+\.html)'))
latestSearch = compile(tagre("a", "href", r'([^"]*toondb/\d+\.html)'))
help = 'Index format: yymmdd' help = 'Index format: yymmdd'
starter = indirectStarter( starter = indirectStarter()
url, compile(tagre("a", "href", r'([^"]*toondb/\d+\.html)')))
class Nicky510(_WordPressScraper): class Nicky510(_WordPressScraper):
@ -136,7 +136,8 @@ class NobodyScores(_BasicScraper):
class NoMoreSavePoints(_WordPressScraper): class NoMoreSavePoints(_WordPressScraper):
url = 'http://www.flowerlarkstudios.com/comic/no-more-save-points/mushroom-hopping/' url = 'http://www.flowerlarkstudios.com/comic/no-more-save-points/mushroom-hopping/'
firstStripUrl = url firstStripUrl = url
starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH) latestSearch = WP_LATEST_SEARCH
starter = indirectStarter()
class NoNeedForBushido(_BasicScraper): class NoNeedForBushido(_BasicScraper):
@ -149,10 +150,10 @@ class NoNeedForBushido(_BasicScraper):
after="attachment-full")) after="attachment-full"))
prevSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl, prevSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl,
after="previous-webcomic")) after="previous-webcomic"))
latestSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl,
after="last-webcomic"))
help = 'Index format: nnn' help = 'Index format: nnn'
starter = indirectStarter( starter = indirectStarter()
url, compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl,
after="last-webcomic")))
class NotInventedHere(_BasicScraper): class NotInventedHere(_BasicScraper):

View file

@ -8,7 +8,6 @@ from __future__ import absolute_import, division, print_function
from re import compile, escape from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter
from ..util import tagre from ..util import tagre
from .common import _WordPressScraper, xpath_class from .common import _WordPressScraper, xpath_class
@ -53,7 +52,6 @@ class OkCancel(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(%sstrips/okcancel\d{8}\.gif)' % rurl)) imageSearch = compile(tagre("img", "src", r'(%sstrips/okcancel\d{8}\.gif)' % rurl))
prevSearch = compile(tagre("div", "class", "previous") + prevSearch = compile(tagre("div", "class", "previous") +
tagre("a", "href", r'(%scomic/\d{1,4}\.html)' % rurl)) tagre("a", "href", r'(%scomic/\d{1,4}\.html)' % rurl))
starter = indirectStarter(url, prevSearch)
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'

View file

@ -20,12 +20,13 @@ class PandyLand(_WordPressScraper):
class ParadigmShift(_BasicScraper): class ParadigmShift(_BasicScraper):
url = 'http://www.paradigmshiftmanga.com/' url = 'http://www.paradigmshiftmanga.com/'
starter = indirectStarter(url, compile(tagre("a", "href", r'([^"]+)', starter = indirectStarter()
after="next-comic-link")))
stripUrl = url + 'ps/%s.html' stripUrl = url + 'ps/%s.html'
imageSearch = compile(tagre("img", "src", r'([^"]*comics/ps/[^"]*)')) imageSearch = compile(tagre("img", "src", r'([^"]*comics/ps/[^"]*)'))
prevSearch = compile(tagre("a", "href", r'([^"]+)', prevSearch = compile(tagre("a", "href", r'([^"]+)',
after="previous-comic-link")) after="previous-comic-link"))
latestSearch = compile(tagre("a", "href", r'([^"]+)',
after="next-comic-link"))
help = 'Index format: custom' help = 'Index format: custom'
@ -72,7 +73,6 @@ class PennyAndAggie(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(http://www\.pennyandaggie\.com/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://www\.pennyandaggie\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r"(index\.php\?p\=\d+)", quote="'") + prevSearch = compile(tagre("a", "href", r"(index\.php\?p\=\d+)", quote="'") +
tagre("img", "src", r'%simages/previous_day\.gif' % rurl, quote="")) tagre("img", "src", r'%simages/previous_day\.gif' % rurl, quote=""))
starter = indirectStarter(url, prevSearch)
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
@ -162,11 +162,12 @@ class PicPakDog(_BasicScraper):
class PiledHigherAndDeeper(_BasicScraper): class PiledHigherAndDeeper(_BasicScraper):
url = 'http://www.phdcomics.com/comics.php' url = 'http://www.phdcomics.com/comics.php'
starter = bounceStarter(url, compile(r'<a href=(archive\.php\?comicid=\d+)>.*<img [^>]*next_button\.gif')) starter = bounceStarter()
stripUrl = url + '?comicid=%s' stripUrl = url + '?comicid=%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd\d+s\d?\.\w{3,4})', quote="")) imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd\d+s\d?\.\w{3,4})', quote=""))
prevSearch = compile(r'<a href=((comics/)?archive\.php\?comicid=\d+)>.*<img [^>]*prev_button\.gif') prevSearch = compile(r'<a href=((comics/)?archive\.php\?comicid=\d+)>.*<img [^>]*prev_button\.gif')
nextSearch = compile(r'<a href=(archive\.php\?comicid=\d+)>.*<img [^>]*next_button\.gif')
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
namer = queryNamer('comicid', usePageUrl=True) namer = queryNamer('comicid', usePageUrl=True)
@ -204,9 +205,9 @@ class PokeyThePenguin(_ParserScraper):
stripUrl = url + 'index%s.html' stripUrl = url + 'index%s.html'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = '//p/img' imageSearch = '//p/img'
prevSearch = True latestSearch = '(//a)[last()]'
multipleImagesPerStrip = True multipleImagesPerStrip = True
starter = indirectStarter(url, "(//a)[last()]") starter = indirectStarter()
help = 'Index format: number' help = 'Index format: number'
def getPrevUrl(self, url, data): def getPrevUrl(self, url, data):
@ -230,22 +231,22 @@ class PoorlyDrawnLines(_BasicScraper):
class Precocious(_BasicScraper): class Precocious(_BasicScraper):
url = 'http://www.precociouscomic.com/' url = 'http://www.precociouscomic.com/'
starter = indirectStarter( starter = indirectStarter()
url, compile(tagre("a", "href", r'(/archive/comic/[^"]+)') +
tagre("img", "src", r"/templates/precocious_main/images/next_arrow\.png"))
)
stripUrl = url + 'archive/comic/%s' stripUrl = url + 'archive/comic/%s'
imageSearch = compile(tagre("img", "src", r'(/comics/\d+[^"]*\.(?:jpg|gif))')) imageSearch = compile(tagre("img", "src", r'(/comics/\d+[^"]*\.(?:jpg|gif))'))
prevSearch = compile(tagre("a", "href", r'(/archive/comic/[^"]+)') + tagre("img", "src", r"/templates/precocious_main/images/back_arrow\.png")) prevSearch = compile(tagre("a", "href", r'(/archive/comic/[^"]+)') + tagre("img", "src", r"/templates/precocious_main/images/back_arrow\.png"))
latestSearch = compile(tagre("a", "href", r'(/archive/comic/[^"]+)') +
tagre("img", "src", r"/templates/precocious_main/images/next_arrow\.png"))
help = 'Index format: yyyy/mm/dd' help = 'Index format: yyyy/mm/dd'
class PS238(_ParserScraper): class PS238(_ParserScraper):
url = 'http://ps238.nodwick.com/' url = 'http://ps238.nodwick.com/'
stripUrl = url + '/comic/%s/' stripUrl = url + '/comic/%s/'
starter = bounceStarter(url, '//a[@class="comic-nav-base comic-nav-next"]') starter = bounceStarter()
imageSearch = '//div[@id="comic"]//img' imageSearch = '//div[@id="comic"]//img'
prevSearch = '//a[@class="comic-nav-base comic-nav-previous"]' prevSearch = '//a[@class="comic-nav-base comic-nav-previous"]'
nextSearch = '//a[@class="comic-nav-base comic-nav-next"]'
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'

View file

@ -1,10 +1,13 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile, escape from re import compile, escape
from ..scraper import _BasicScraper
from ..scraper import _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import bounceStarter from ..helpers import bounceStarter
from ..util import tagre from ..util import tagre
@ -16,6 +19,7 @@ class RadioactivePanda(_BasicScraper):
prevSearch = compile(r'<a href="(/comic/.*?)".+?previous_btn') prevSearch = compile(r'<a href="(/comic/.*?)".+?previous_btn')
help = 'Index format: n (no padding)' help = 'Index format: n (no padding)'
class RalfTheDestroyer(_ParserScraper): class RalfTheDestroyer(_ParserScraper):
url = 'http://ralfthedestroyer.com/' url = 'http://ralfthedestroyer.com/'
stripUrl = url + '%s/' stripUrl = url + '%s/'
@ -47,12 +51,12 @@ class RealmOfAtland(_BasicScraper):
class RedMeat(_BasicScraper): class RedMeat(_BasicScraper):
baseUrl = 'http://www.redmeat.com/redmeat/' baseUrl = 'http://www.redmeat.com/redmeat/'
url = baseUrl + 'current/index.html' url = baseUrl + 'current/index.html'
starter = bounceStarter(url, starter = bounceStarter()
compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="next")))
stripUrl = baseUrl + '%s/index.html' stripUrl = baseUrl + '%s/index.html'
firstStripUrl = stripUrl % '1996-06-10' firstStripUrl = stripUrl % '1996-06-10'
imageSearch = compile(tagre("img", "src", r'(http://www\.redmeat\.com/imager/b/redmeat/[^"]*\.png)')) imageSearch = compile(tagre("img", "src", r'(http://www\.redmeat\.com/imager/b/redmeat/[^"]*\.png)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="prev")) prevSearch = compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="prev"))
nextSearch = compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="next"))
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
@ -81,7 +85,8 @@ class RomanticallyApocalyptic(_BasicScraper):
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(%sart/\d+[^"]+)' % rurl)) imageSearch = compile(tagre("img", "src", r'(%sart/\d+[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%s\d+[^"]+)' % rurl)+"\s*"+tagre('span', 'class', 'spritePrevious')) prevSearch = compile(tagre("a", "href", r'(%s\d+[^"]+)' % rurl) + "\s*" +
tagre('span', 'class', 'spritePrevious'))
help = 'Index format: n' help = 'Index format: n'
adult = True adult = True
@ -101,5 +106,6 @@ class Ruthe(_BasicScraper):
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
lang = 'de' lang = 'de'
imageSearch = compile(tagre("img", "src", r'(/?cartoons/strip_\d+[^"]+)')) imageSearch = compile(tagre("img", "src", r'(/?cartoons/strip_\d+[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/cartoon/\d+/datum/asc/)')+'vorheriger') prevSearch = compile(tagre("a", "href", r'(/cartoon/\d+/datum/asc/)') +
'vorheriger')
help = 'Index format: number' help = 'Index format: number'

View file

@ -69,9 +69,10 @@ class ScandinaviaAndTheWorld(_ParserScraper):
url = 'http://satwcomic.com/' url = 'http://satwcomic.com/'
stripUrl = url + '%s' stripUrl = url + '%s'
firstStripUrl = stripUrl % 'sweden-denmark-and-norway' firstStripUrl = stripUrl % 'sweden-denmark-and-norway'
starter = indirectStarter(url, '//a[text()="View latest comic"]') starter = indirectStarter()
imageSearch = '//img[@itemprop="image"]' imageSearch = '//img[@itemprop="image"]'
prevSearch = '//a[@accesskey="p"]' prevSearch = '//a[@accesskey="p"]'
latestSearch = '//a[text()="View latest comic"]'
textSearch = '//span[@itemprop="articleBody"]' textSearch = '//span[@itemprop="articleBody"]'
help = 'Index format: stripname' help = 'Index format: stripname'
@ -194,9 +195,9 @@ class SexyLosers(_BasicScraper):
stripUrl = url + '%s.html' stripUrl = url + '%s.html'
imageSearch = compile(r'<img src\s*=\s*"\s*(comics/[\w\.]+?)"', IGNORECASE) imageSearch = compile(r'<img src\s*=\s*"\s*(comics/[\w\.]+?)"', IGNORECASE)
prevSearch = compile(r'<a href="(/\d{3}\.\w+?)"><font color = FFAAAA><<', IGNORECASE) prevSearch = compile(r'<a href="(/\d{3}\.\w+?)"><font color = FFAAAA><<', IGNORECASE)
latestSearch = compile(r'SEXY LOSERS <A HREF="(.+?)">Latest SL Comic \(#\d+\)</A>', IGNORECASE)
help = 'Index format: nnn' help = 'Index format: nnn'
starter = indirectStarter(url, starter = indirectStarter()
compile(r'SEXY LOSERS <A HREF="(.+?)">Latest SL Comic \(#\d+\)</A>', IGNORECASE))
@classmethod @classmethod
def namer(cls, imageUrl, pageUrl): def namer(cls, imageUrl, pageUrl):
@ -333,7 +334,8 @@ class SnowFlame(_WordPressScraper):
url = 'http://www.snowflamecomic.com/' url = 'http://www.snowflamecomic.com/'
stripUrl = url + '?comic=snowflame-%s-%s' stripUrl = url + '?comic=snowflame-%s-%s'
firstStripUrl = stripUrl % ('01', '01') firstStripUrl = stripUrl % ('01', '01')
starter = bounceStarter(url, WP_LATEST_SEARCH) starter = bounceStarter()
nextSearch = WP_LATEST_SEARCH
help = 'Index format: chapter-page' help = 'Index format: chapter-page'
def getIndexStripUrl(self, index): def getIndexStripUrl(self, index):
@ -392,8 +394,9 @@ class Spamusement(_BasicScraper):
imageSearch = compile(r'<img src="(%sgfx/\d+\..+?)"' % rurl, IGNORECASE) imageSearch = compile(r'<img src="(%sgfx/\d+\..+?)"' % rurl, IGNORECASE)
prevSearch = compile(r'<a href="(%sindex.php/comics/view/.+?)">' % rurl, prevSearch = compile(r'<a href="(%sindex.php/comics/view/.+?)">' % rurl,
IGNORECASE) IGNORECASE)
latestSearch = prevSearch
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
starter = indirectStarter(url, prevSearch) starter = indirectStarter()
class SpareParts(_BasicScraper): class SpareParts(_BasicScraper):
@ -504,8 +507,7 @@ class StuffNoOneToldMe(_BasicScraper):
stripUrl = url + '%s.html' stripUrl = url + '%s.html'
firstStripUrl = stripUrl % '2010/05/01' firstStripUrl = stripUrl % '2010/05/01'
olderHref = r"(http://www\.snotm\.com/\d+/\d+/[^']+\.html)" olderHref = r"(http://www\.snotm\.com/\d+/\d+/[^']+\.html)"
starter = indirectStarter( starter = indirectStarter()
url, compile(tagre("a", "href", olderHref, quote="'")))
imageSearch = ( imageSearch = (
compile(tagre("img", "src", r'(http://i\.imgur\.com/[^"]+)') + compile(tagre("img", "src", r'(http://i\.imgur\.com/[^"]+)') +
r"(?:</a>|<br />)"), r"(?:</a>|<br />)"),
@ -515,6 +517,7 @@ class StuffNoOneToldMe(_BasicScraper):
) )
prevSearch = compile(tagre("a", "href", olderHref, quote="'", prevSearch = compile(tagre("a", "href", olderHref, quote="'",
before="older-link")) before="older-link"))
latestSearch = compile(tagre("a", "href", olderHref, quote="'"))
multipleImagesPerStrip = True multipleImagesPerStrip = True
help = 'Index format: yyyy/mm/stripname' help = 'Index format: yyyy/mm/stripname'

View file

@ -76,14 +76,14 @@ class TheNoob(_BasicScraper):
class TheOrderOfTheStick(_BasicScraper): class TheOrderOfTheStick(_BasicScraper):
baseUrl = 'http://www.giantitp.com/' url = 'http://www.giantitp.com/'
url = baseUrl + 'comics/oots0863.html' stripUrl = url + 'comics/oots%s.html'
stripUrl = baseUrl + 'comics/oots%s.html'
firstStripUrl = stripUrl % '0001' firstStripUrl = stripUrl % '0001'
imageSearch = compile(r'<IMG src="(/comics/images/[^"]+)">') imageSearch = compile(r'<IMG src="(/comics/images/[^"]+)">')
prevSearch = compile(r'<A href="(/comics/oots\d{4}\.html)"><IMG src="/Images/redesign/ComicNav_Back.gif"') prevSearch = compile(r'<A href="(/comics/oots\d{4}\.html)"><IMG src="/Images/redesign/ComicNav_Back.gif"')
latestSearch = compile(r'<A href="(/comics/oots\d{4}\.html)"')
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
starter = indirectStarter(baseUrl, compile(r'<A href="(/comics/oots\d{4}\.html)"')) starter = indirectStarter()
@classmethod @classmethod
def namer(cls, imageUrl, pageUrl): def namer(cls, imageUrl, pageUrl):
@ -116,9 +116,13 @@ class TheThinHLine(_BasicScraper):
rurl = escape(url) rurl = escape(url)
stripUrl = url + 'post/%s' stripUrl = url + 'post/%s'
firstStripUrl = stripUrl % '3517345105' firstStripUrl = stripUrl % '3517345105'
imageSearch = compile(tagre('img', 'data-src', r'([^"]+media.tumblr.com/[^"]+)', before='content-image')) imageSearch = compile(tagre('img', 'data-src',
r'([^"]+media.tumblr.com/[^"]+)',
before='content-image'))
prevSearch = compile(tagre("a", "href", r'([^"]+)') + '&gt;</a>') prevSearch = compile(tagre("a", "href", r'([^"]+)') + '&gt;</a>')
starter = indirectStarter(url, compile(tagre("a", "href", r'([^"]+)', after='class="timestamp"'))) latestSearch = compile(tagre("a", "href", r'([^"]+)',
after='class="timestamp"'))
starter = indirectStarter()
adult = True adult = True
indirectImageSearch = compile(tagre('a', 'href', r'(%simage/\d+)' % rurl)) indirectImageSearch = compile(tagre('a', 'href', r'(%simage/\d+)' % rurl))

View file

@ -21,12 +21,10 @@ class Underling(_WordPressScraper):
class Undertow(_BasicScraper): class Undertow(_BasicScraper):
url = 'http://undertow.dreamshards.org/' url = 'http://undertow.dreamshards.org/'
stripUrl = url + '%s'
imageSearch = compile(tagre("img", "src", r'([^"]+\.jpg)')) imageSearch = compile(tagre("img", "src", r'([^"]+\.jpg)'))
prevSearch = compile(r'href="(.+?)".+?teynpoint') prevSearch = compile(r'href="(.+?)".+?teynpoint')
help = 'Index format: good luck !' latestSearch = compile(r'href="(.+?)".+?Most recent page')
starter = indirectStarter(url, starter = indirectStarter()
compile(r'href="(.+?)".+?Most recent page'))
class UnicornJelly(_BasicScraper): class UnicornJelly(_BasicScraper):
@ -46,9 +44,10 @@ class Unsounded(_BasicScraper):
rurl = escape(url) rurl = escape(url)
imageSearch = compile(tagre("img", "src", r'(pageart/[^"]*)')) imageSearch = compile(tagre("img", "src", r'(pageart/[^"]*)'))
prevSearch = compile(tagre("a", "href", r'([^"]*)', after='class="back')) prevSearch = compile(tagre("a", "href", r'([^"]*)', after='class="back'))
starter = indirectStarter( latestSearch = compile(tagre("a", "href", r'(%scomic/[^"]*)' % rurl) +
url, compile(tagre("a", "href", r'(%scomic/[^"]*)' % rurl) + tagre("img", "src",
tagre("img", "src", r"%simages/newpages\.png" % rurl))) r"%simages/newpages\.png" % rurl))
starter = indirectStarter()
help = 'Index format: chapter-number' help = 'Index format: chapter-number'
def getIndexStripUrl(self, index): def getIndexStripUrl(self, index):

View file

@ -45,7 +45,7 @@ class WayfarersMoon(_BasicScraper):
class WebDesignerCOTW(_BasicScraper): class WebDesignerCOTW(_BasicScraper):
url = 'http://www.webdesignerdepot.com/' url = 'http://www.webdesignerdepot.com/'
rurl = escape(url) rurl = escape(url)
starter = indirectStarter(url, compile(tagre("a", "href", r'(%s\d+/\d+/[^"]+/)' % rurl))) starter = indirectStarter()
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2009/11/comics-of-the-week-1' firstStripUrl = stripUrl % '2009/11/comics-of-the-week-1'
imageSearch = ( imageSearch = (
@ -57,6 +57,7 @@ class WebDesignerCOTW(_BasicScraper):
multipleImagesPerStrip = True multipleImagesPerStrip = True
prevSearch = compile(tagre("link", "href", r"(%s\d+/\d+/[^']+)" % rurl, prevSearch = compile(tagre("link", "href", r"(%s\d+/\d+/[^']+)" % rurl,
before='prev', quote="'")) before='prev', quote="'"))
latestSearch = compile(tagre("a", "href", r'(%s\d+/\d+/[^"]+/)' % rurl))
help = 'Index format: yyyy/mm/stripname' help = 'Index format: yyyy/mm/stripname'
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):

View file

@ -4,13 +4,13 @@ from ..scraper import make_scraper
from .common import _WordPressScraper from .common import _WordPressScraper
def add(name, url, starter=None): def add(name, start):
attrs = dict( attrs = dict(
name=name, name=name,
url=url url='http://hijinksensue.com/',
latestSearch=start,
starter=indirectStarter()
) )
if starter:
attrs['starter'] = starter
globals()[name] = make_scraper(name, _WordPressScraper, **attrs) globals()[name] = make_scraper(name, _WordPressScraper, **attrs)
@ -22,4 +22,4 @@ for (name, starterXPath) in [
('HijinksEnsueConvention', '//h4[text()="Latest Fancy Convention Sketches"]/..//a'), ('HijinksEnsueConvention', '//h4[text()="Latest Fancy Convention Sketches"]/..//a'),
('HijinksEnsuePhoto', '//h4[text()="Latest Fancy Photo Comic"]/..//a') ('HijinksEnsuePhoto', '//h4[text()="Latest Fancy Photo Comic"]/..//a')
]: ]:
add(name, 'http://hijinksensue.com/', starter=indirectStarter('http://hijinksensue.com/', starterXPath)) add(name, starterXPath)

View file

@ -1,6 +1,9 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile from re import compile
@ -12,13 +15,13 @@ from ..util import tagre
class Xkcd(_BasicScraper): class Xkcd(_BasicScraper):
name = 'xkcd' name = 'xkcd'
url = 'http://xkcd.com/' url = 'http://xkcd.com/'
starter = bounceStarter(url, compile(tagre("a", "href", r'(/\d+/)', starter = bounceStarter()
before="next")))
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", imageSearch = compile(tagre("img", "src",
r'(//imgs\.xkcd\.com/comics/[^"]+)')) r'(//imgs\.xkcd\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/\d+/)', before="prev")) prevSearch = compile(tagre("a", "href", r'(/\d+/)', before="prev"))
nextSearch = compile(tagre("a", "href", r'(/\d+/)', before="next"))
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
textSearch = compile(tagre("img", "title", r'([^"]+)', textSearch = compile(tagre("img", "title", r'([^"]+)',
before=r'//imgs\.xkcd\.com/comics/')) before=r'//imgs\.xkcd\.com/comics/'))

View file

@ -22,15 +22,16 @@ class ZapComic(_ParserScraper):
class Zapiro(_BasicScraper): class Zapiro(_BasicScraper):
url = 'http://www.mg.co.za/zapiro/' url = 'http://www.mg.co.za/zapiro/'
starter = bounceStarter( starter = bounceStarter()
url, compile(tagre("li", "class", r'nav_older') +
tagre("a", "href", r'(http://mg\.co\.za/cartoon/[^"]+)')))
stripUrl = 'http://mg.co.za/cartoon/%s' stripUrl = 'http://mg.co.za/cartoon/%s'
firstStripUrl = stripUrl % 'zapiro_681' firstStripUrl = stripUrl % 'zapiro_681'
imageSearch = compile(tagre("img", "src", r'(http://cdn\.mg\.co\.za/crop/content/cartoons/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://cdn\.mg\.co\.za/crop/content/cartoons/[^"]+)'))
prevSearch = compile(tagre("li", "class", r'nav_older') + prevSearch = compile(tagre("li", "class", r'nav_older') +
tagre("a", "href", tagre("a", "href",
r'(http://mg\.co\.za/cartoon/[^"]+)')) r'(http://mg\.co\.za/cartoon/[^"]+)'))
nextSearch = compile(tagre("li", "class", r'nav_older') +
tagre("a", "href",
r'(http://mg\.co\.za/cartoon/[^"]+)'))
help = 'Index format: yyyy-mm-dd-stripname' help = 'Index format: yyyy-mm-dd-stripname'
@classmethod @classmethod