Read starter parameters from class.

This allows starters to be specified in a more declarative and dynamic way.
This commit is contained in:
Tobias Gruetzmacher 2016-04-12 23:11:39 +02:00
parent b865a171f9
commit 42e43fa4e6
23 changed files with 186 additions and 140 deletions

View file

@ -1,8 +1,13 @@
# -*- coding: iso-8859-1 -*-
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from .util import getQueryParams
def queryNamer(paramName, usePageUrl=False):
"""Get name from URL query part."""
@classmethod
@ -25,23 +30,32 @@ def regexNamer(regex, usePageUrl=False):
return _namer
def bounceStarter(url, nextSearch):
"""Get start URL by "bouncing" back and forth one time."""
def bounceStarter():
"""Get start URL by "bouncing" back and forth one time.
This needs the url and nextSearch properties be defined on the class.
"""
@classmethod
def _starter(cls):
"""Get bounced start URL."""
data = cls.getPage(url)
url1 = cls.fetchUrl(url, data, cls.prevSearch)
data = cls.getPage(cls.url)
url1 = cls.fetchUrl(cls.url, data, cls.prevSearch)
data = cls.getPage(url1)
return cls.fetchUrl(url1, data, nextSearch)
return cls.fetchUrl(url1, data, cls.nextSearch)
return _starter
def indirectStarter(url, latestSearch):
"""Get start URL by indirection."""
def indirectStarter():
"""Get start URL by indirection.
This is useful for comics where the latest comic can't be reached at a
stable URL. If the class has an attribute 'startUrl', this page is fetched
first, otherwise the page at 'url' is fetched. After that, the attribute
'latestSearch' is used on the page content to find the latest strip."""
@classmethod
def _starter(cls):
"""Get indirect start URL."""
url = cls.startUrl if hasattr(cls, "startUrl") else cls.url
data = cls.getPage(url)
return cls.fetchUrl(url, data, latestSearch)
return cls.fetchUrl(url, data, cls.latestSearch)
return _starter

View file

@ -16,8 +16,7 @@ from .common import _WordPressScraper, xpath_class, WP_LATEST_SEARCH
class AbstruseGoose(_BasicScraper):
url = 'http://abstrusegoose.com/'
rurl = escape(url)
starter = bounceStarter(
url, compile(tagre('a', 'href', r'(%s\d+)' % rurl) + "Next »"))
starter = bounceStarter()
stripUrl = url + '%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre('img', 'src',
@ -81,7 +80,6 @@ class AfterStrife(_WordPressScraper):
class AGirlAndHerFed(_BasicScraper):
url = 'http://www.agirlandherfed.com/'
starter = bounceStarter(url, compile(r'<a href="([^"]+)">[^>]+Back'))
stripUrl = url + '1.%s.html'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(img/strip/[^"]+\.jpg)'))
@ -114,7 +112,6 @@ class ALessonIsLearned(_BasicScraper):
url = 'http://www.alessonislearned.com/'
prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=\d+)",
quote="'") + r"[^>]+previous")
starter = indirectStarter(url, prevSearch)
stripUrl = url + 'index.php?comic=%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r"(cmx/lesson\d+\.[a-z]+)"))
@ -124,8 +121,8 @@ class ALessonIsLearned(_BasicScraper):
class Alice(_WordPressScraper):
url = 'http://www.alicecomics.com/'
prevSearch = '//a[%s]' % xpath_class('navi-prev-in')
starter = indirectStarter('http://www.alicecomics.com/',
'//a[text()="Latest Alice!"]')
latestSearch = '//a[text()="Latest Alice!"]'
starter = indirectStarter()
class AlienLovesPredator(_BasicScraper):
@ -264,7 +261,8 @@ class ARedTailsDream(_BasicScraper):
class Ashes(_WordPressScraper):
url = 'http://www.flowerlarkstudios.com/comic/prologue/10232009/'
firstStripUrl = url
starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH)
latestSearch = WP_LATEST_SEARCH
starter = indirectStarter()
class ASkeweredParadise(_BasicScraper):
@ -289,12 +287,13 @@ class ASofterWorld(_ParserScraper):
class AstronomyPOTD(_ParserScraper):
baseUrl = 'http://apod.nasa.gov/apod/'
url = baseUrl + 'astropix.html'
starter = bounceStarter(url, '//a[text()=">"]')
starter = bounceStarter()
stripUrl = baseUrl + 'ap%s.html'
firstStripUrl = stripUrl % '061012'
imageSearch = '//a/img'
multipleImagesPerStrip = True
prevSearch = '//a[text()="<"]'
nextSearch = '//a[text()=">"]'
help = 'Index format: yymmdd'
def shouldSkipUrl(self, url, data):

View file

@ -47,8 +47,8 @@ class BalderDash(_ComicControlScraper):
class Bardsworth(_WordPressScraper):
url = 'http://www.bardsworth.com/'
starter = indirectStarter('http://www.bardsworth.com/',
'//a[@rel="bookmark"]')
latestSearch = '//a[@rel="bookmark"]'
starter = indirectStarter()
class Baroquen(_BasicScraper):
@ -72,12 +72,15 @@ class Beetlebum(_BasicScraper):
rurl = escape(url)
stripUrl = url + '%s'
firstStripUrl = stripUrl % '2006/03/10/quiz-fur-ruskiphile'
starter = indirectStarter(url, compile(tagre('a', 'href', r'(%s\d{4}/\d{2}/\d{2}/[^"]+)' % rurl, after='bookmark')))
starter = indirectStarter()
multipleImagesPerStrip = True
imageSearch = compile(tagre('img', 'src', r'(http://blog\.beetlebum\.de/wp-content/uploads/[^"]+)'))
prevSearch = compile(tagre('a', 'href',
r'(%s\d{4}/\d{2}/\d{2}/[^"]*)' % rurl,
after='prev'))
latestSearch = compile(tagre('a', 'href',
r'(%s\d{4}/\d{2}/\d{2}/[^"]+)' % rurl,
after='bookmark'))
help = 'Index format: yyyy/mm/dd/striptitle'
lang = 'de'
@ -223,7 +226,8 @@ class BoredAndEvil(_BasicScraper):
firstStripUrl = stripUrl % '2004-06-07'
imageSearch = compile(tagre("img", "src", r'(strips/[^"]+)'))
prevSearch = compile(r'First Comic.+<a href="(.+?)".+previous-on.gif')
starter = indirectStarter(url, prevSearch)
latestSearch = prevSearch
starter = indirectStarter()
help = 'Index format: yyyy-mm-dd'

View file

@ -55,7 +55,7 @@ class Carciphona(_BasicScraper):
after="prevarea"))
latestSearch = compile(tagre("a", "href",
r'(view\.php\?page=[0-9]+[^"]*)'))
starter = indirectStarter(url, latestSearch)
starter = indirectStarter()
@classmethod
def namer(cls, imageUrl, pageUrl):
@ -275,10 +275,11 @@ class CoolCatStudio(_BasicScraper):
class CorydonCafe(_ParserScraper):
url = 'http://corydoncafe.com/'
starter = indirectStarter(url, '//ul//a')
starter = indirectStarter()
stripUrl = url + '%s.php'
imageSearch = "//center[2]//img"
prevSearch = '//a[@title="prev"]'
latestSearch = '//ul//a'
help = 'Index format: yyyy/stripname'
@classmethod
@ -345,14 +346,15 @@ class CucumberQuest(_BasicScraper):
rurl = escape(url)
stripUrl = url + 'cq/%s/'
firstStripUrl = stripUrl % 'page-1'
starter = indirectStarter(url + 'recent.html',
compile(r'window\.location="(/cq/[^"]+/)"'))
startUrl = url + 'recent.html'
starter = indirectStarter()
imageSearch = (
compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/\d+[^"]+)' % rurl)),
compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/ch\d+[^"]+)' % rurl)),
compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/bonus[^"]+)' % rurl)),
)
prevSearch = compile(tagre("a", "href", r'(%scq/[^"]+/)' % rurl, after="previous"))
latestSearch = compile(r'window\.location="(/cq/[^"]+/)"')
help = 'Index format: stripname'
@ -377,11 +379,12 @@ class Curvy(_ParserScraper):
class CyanideAndHappiness(_BasicScraper):
url = 'http://www.explosm.net/comics/'
starter = bounceStarter(url, compile(tagre("a", "href", r"(/comics/\d+/)", after="next-comic")))
starter = bounceStarter()
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '15'
imageSearch = compile(tagre("img", "src", r'(//files.explosm.net/comics/[^"]+)', before="main-comic"))
prevSearch = compile(tagre("a", "href", r'(/comics/\d+/)', after="previous-comic"))
nextSearch = compile(tagre("a", "href", r"(/comics/\d+/)", after="next-comic"))
help = 'Index format: n (unpadded)'
def shouldSkipUrl(self, url, data):

View file

@ -13,17 +13,12 @@ from ..util import tagre
from .common import _WordPressScraper, xpath_class
class DailyDose(_BasicScraper):
class DailyDose(_ParserScraper):
url = 'http://dailydoseofcomics.com/'
starter = indirectStarter(
url, compile(tagre("a", "href",
r'(http://dailydoseofcomics\.com/[^"]+)',
after="preview")))
stripUrl = url + '%s/'
imageSearch = compile(tagre("img", "src", r'([^"]+)',
before="align(?:none|center)"))
prevSearch = compile(tagre("a", "href", r'(http://dailydoseofcomics\.com/[^"]+)', after="prev"))
help = 'Index format: stripname'
starter = indirectStarter()
imageSearch = '//p/a/img'
prevSearch = '//a[@rel="prev"]'
latestSearch = '//a[@rel="bookmark"]'
class DamnLol(_BasicScraper):
@ -31,13 +26,13 @@ class DamnLol(_BasicScraper):
rurl = escape(url)
stripUrl = url + '%s.html'
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
nextSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="next"))
imageSearch = (
compile(tagre("img", "src", r'(%si/[^"]+)' % rurl)),
compile(tagre("img", "src", r'(%spics/[^"]+)' % rurl)),
)
help = 'Index format: stripname-number'
starter = bounceStarter(
url, compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="next")))
starter = bounceStarter()
@classmethod
def namer(cls, imageUrl, pageUrl):
@ -160,9 +155,12 @@ class Dilbert(_BasicScraper):
url = 'http://dilbert.com/'
stripUrl = url + '/strip/%s/'
firstStripUrl = stripUrl % '1989-04-16'
starter = indirectStarter(url, compile(tagre("a", "href", r'(http://dilbert.com/strip/[0-9-]*)', after="Click to see")))
starter = indirectStarter()
prevSearch = compile(tagre("a", "href", r'(/strip/\d+-\d+-\d+)', after="Older Strip"))
imageSearch = compile(tagre("img", "src", r'(http://assets.amuniversal.com/\w+)'))
latestSearch = compile(tagre("a", "href",
r'(http://dilbert.com/strip/[0-9-]*)',
after="Click to see"))
help = 'Index format: yyyy-mm-dd'
@classmethod
@ -254,9 +252,10 @@ class DresdenCodak(_BasicScraper):
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl) +
tagre("img", "src", r"%sm_prev2?\.png" % rurl,
quote=""))
starter = indirectStarter(
url, compile(tagre("div", "id", "preview") +
tagre("a", "href", r'(%s\d+/\d+/\d+/[^"]+)' % rurl)))
latestSearch = compile(tagre("div", "id", "preview") +
tagre("a", "href",
r'(%s\d+/\d+/\d+/[^"]+)' % rurl))
starter = indirectStarter()
class DrFun(_BasicScraper):

View file

@ -15,9 +15,10 @@ from .common import _WordPressScraper, WP_LATEST_SEARCH, xpath_class
class EarthsongSaga(_ParserScraper):
url = 'http://earthsongsaga.com/index.php'
starter = indirectStarter(url, '//div[@id="leftmenu"]/span[1]/a[1]')
starter = indirectStarter()
imageSearch = '//div[@id="comic"]//img'
prevSearch = '//a[@title="Previous"]'
latestSearch = '//div[@id="leftmenu"]/span[1]/a[1]'
@classmethod
def fetchUrls(cls, url, data, urlSearch):
@ -43,21 +44,23 @@ class EarthsongSaga(_ParserScraper):
class EasilyAmused(_WordPressScraper):
url = 'http://www.flowerlarkstudios.com/comic/college-daze/ea01/'
firstStripUrl = url
starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH)
latestSearch = WP_LATEST_SEARCH
starter = indirectStarter()
class EatLiver(_BasicScraper):
url = 'http://www.eatliver.com/'
rurl = escape(url)
starter = indirectStarter(url, compile(
tagre("a", "href", r'(i\.php\?n=\d+)') +
tagre("img", "src", r'img/small/[^"]+') + r"</a>\s*<br"))
starter = indirectStarter()
stripUrl = url + "i.php?n=%s"
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("link", "href", r'(%simg/\d+/[^"]+)' % rurl,
before="image_src"))
prevSearch = compile(tagre("a", "href", r'(i\.php\?n=\d+)') +
"&#060;&#060; Previous")
latestSearch = compile(tagre("a", "href", r'(i\.php\?n=\d+)') +
tagre("img", "src", r'img/small/[^"]+') +
r"</a>\s*<br")
class EatThatToast(_BasicScraper):
@ -181,7 +184,8 @@ class Erstwhile(_WordPressScraper):
class Eryl(_WordPressScraper):
url = 'http://www.flowerlarkstudios.com/comic/prologue-migration/page-i/'
firstStripUrl = url
starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH)
latestSearch = WP_LATEST_SEARCH
starter = indirectStarter()
help = 'This was known as DarkWings in previous Dosage versions'

View file

@ -26,10 +26,9 @@ class FantasyRealms(_BasicScraper):
stripUrl = url + 'manga/%s.php'
imageSearch = compile(r'<img src="(\d{1,4}.\w{3,4})" width="540"', IGNORECASE)
prevSearch = compile(r'<a href="(.+?)"><img src="../images/nav-back.gif"', IGNORECASE)
latestSearch = compile(r'<a href="(manga/.+?)"><img src="preview.jpg"', IGNORECASE)
help = 'Index format: nnn'
starter = indirectStarter(
url,
compile(r'<a href="(manga/.+?)"><img src="preview.jpg"', IGNORECASE))
starter = indirectStarter()
class FauxPas(_BasicScraper):
@ -47,8 +46,9 @@ class FeyWinds(_BasicScraper):
stripUrl = baseUrl + 'comic/page.php?id=%s'
imageSearch = compile(r"(../comic/pages//.+?)'")
prevSearch = compile(r"(page.php\?id=.+?)'.+?navprevious.png")
latestSearch = compile(r'(comic/page.php\?id.+?)"')
help = 'Index format: n (unpadded)'
starter = indirectStarter(url, compile(r'(comic/page.php\?id.+?)"'))
starter = indirectStarter()
class FilibusterCartoons(_BasicScraper):
@ -159,9 +159,9 @@ class FredoAndPidjin(_BasicScraper):
)
multipleImagesPerStrip = True
prevSearch = compile(tagre('a', 'href', '([^"]+)') + "Prev</a>")
starter = indirectStarter(
url,
compile(tagre('a', 'href', "(" + url + r'\d\d\d\d/\d\d/\d\d/[^"]+/)')))
latestSearch = compile(tagre('a', 'href', "(" + url +
r'\d\d\d\d/\d\d/\d\d/[^"]+/)'))
starter = indirectStarter()
class Freefall(_BasicScraper):

View file

@ -27,15 +27,15 @@ class Garanos(_BasicScraper):
baseUrl = 'http://garanos.alexheberling.com/'
rurl = escape(baseUrl)
url = baseUrl + 'pages/page-1/'
starter = indirectStarter(
url, compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
after="nav-last")))
starter = indirectStarter()
stripUrl = baseUrl + 'pages/page-%s'
imageSearch = compile(
tagre("img", "src",
r'(%swp-content/uploads/sites/\d+/\d+/\d+/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
after="prev"))
latestSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
after="nav-last"))
help = 'Index format: n (unpadded)'
@ -136,14 +136,14 @@ class GoGetARoomie(_ComicControlScraper):
class GoneWithTheBlastwave(_BasicScraper):
url = 'http://www.blastwave-comic.com/index.php?p=comic&nro=1'
starter = indirectStarter(
url, compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' +
r'<img src="images/page/default/latest'))
starter = indirectStarter()
stripUrl = url[:-1] + '%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(r'<img.+src=".+(/comics/.+?)"')
prevSearch = compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' +
r'<img src="images/page/default/previous')
latestSearch = compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' +
r'<img src="images/page/default/latest')
help = 'Index format: n'
@classmethod

View file

@ -41,15 +41,15 @@ class _HappyJar(_WordPressScraper):
class HarkAVagrant(_BasicScraper):
url = 'http://www.harkavagrant.com/'
rurl = escape(url)
starter = bounceStarter(
url, compile(tagre("a", "href", r'(%sindex\.php\?id=\d+)' % rurl) +
tagre("img", "src", "buttonnext.png")))
starter = bounceStarter()
stripUrl = url + 'index.php?id=%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(%s[^"]+)' % rurl,
after='BORDER'))
prevSearch = compile(tagre("a", "href", r'(%sindex\.php\?id=\d+)' % rurl) +
tagre("img", "src", "buttonprevious.png"))
nextSearch = compile(tagre("a", "href", r'(%sindex\.php\?id=\d+)' % rurl) +
tagre("img", "src", "buttonnext.png"))
help = 'Index format: number'
@classmethod

View file

@ -1,8 +1,12 @@
# -*- coding: iso-8859-1 -*-
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile, escape
from ..scraper import _BasicScraper
from ..util import tagre
from ..helpers import indirectStarter
@ -30,9 +34,12 @@ class JerkCity(_BasicScraper):
class JimBenton(_BasicScraper):
url = 'http://www.jimbenton.com/page14/page14.html'
stripUrl = 'http://www.jimbenton.com/page14/files/JimBentonComic-%s.html'
starter = indirectStarter(url, compile(tagre("a", "href", r'(files/JimBentonComic-[^>]+\.html)', quote="")))
imageSearch = compile(tagre("img", "src", r'(JimBentonComic-[^"]+)', before="photo-frame"))
prevSearch = compile(tagre("a", "href", r'(JimBentonComic-[^>]+\.html)', quote="") + "Next")
starter = indirectStarter()
imageSearch = compile(tagre("img", "src", r'(JimBentonComic-[^"]+)',
before="photo-frame"))
prevSearch = compile(tagre("a", "href", r'(JimBentonComic-[^>]+\.html)',
quote="") + "Next")
latestSearch = compile(tagre("a", "href", r'(files/JimBentonComic-[^>]+\.html)', quote=""))
help = 'Index format: stripname'
@ -58,6 +65,7 @@ class JustAnotherEscape(_BasicScraper):
rurl = escape(url)
stripUrl = url + 'index.cgi?date=%s'
imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%s/index\.cgi\?date=\d+)' % rurl)
+ tagre("img", "alt", "Previous Comic"))
prevSearch = compile(tagre("a", "href",
r'(%s/index\.cgi\?date=\d+)' % rurl) +
tagre("img", "alt", "Previous Comic"))
help = 'Index format: yyyymmdd'

View file

@ -9,7 +9,6 @@ from re import compile, escape, IGNORECASE
from ..scraper import _BasicScraper
from ..util import tagre
from ..helpers import indirectStarter
from .common import _ComicControlScraper, _WordPressScraper, xpath_class
@ -81,4 +80,3 @@ class KuroShouri(_BasicScraper):
tagre("a", "href", r'(%s\?webcomic_post\=[^"]+)' % rurl,
after="previous"))
help = 'Index format: chapter-n-page-m'
starter = indirectStarter(url, prevSearch)

View file

@ -21,10 +21,10 @@ class Lackadaisy(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(http://www\.lackadaisycats\.com/comic/[^"]*)'))
prevSearch = compile(tagre("a", "href", r"(/comic\.php\?comicid=[0-9]+)") +
"&lt; Previous")
nextSearch = compile(tagre("a", "href", r"(/comic.php\?comicid=[0-9]+)") +
"Next")
help = 'Index format: n'
starter = bounceStarter(
url, compile(tagre("a", "href", r"(/comic.php\?comicid=[0-9]+)") +
"Next"))
starter = bounceStarter()
@classmethod
def namer(cls, imageUrl, pageUrl):
@ -37,7 +37,8 @@ class Lackadaisy(_BasicScraper):
class Laiyu(_WordPressScraper):
url = 'http://www.flowerlarkstudios.com/comic/preliminary-concepts/welcome/'
firstStripUrl = url
starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH)
latestSearch = WP_LATEST_SEARCH
starter = indirectStarter()
class LasLindas(_BasicScraper):
@ -64,9 +65,9 @@ class LeastICouldDo(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/\d{8,9}\.\w{1,4})' % rurl))
prevSearch = compile(tagre("a", "href", r'(%scomic/\d+/)' % rurl,
after="Previous"))
starter = indirectStarter(
url, compile(tagre("a", "href", r'(%scomic/\d+/)' % rurl,
after="feature-comic")))
latestSearch = compile(tagre("a", "href", r'(%scomic/\d+/)' % rurl,
after="feature-comic"))
starter = indirectStarter()
help = 'Index format: yyyymmdd'
@ -110,12 +111,11 @@ class LoadingArtist(_ParserScraper):
class LookingForGroup(_ParserScraper):
url = 'http://www.lfgcomic.com/'
rurl = escape(url)
stripUrl = url + 'page/%s/'
firstStripUrl = stripUrl % '1'
css = True
imageSearch = '#comic img'
prevSearch = '#comic-left > a'
starter = indirectStarter(url, '#header-dropdown-comic-lfg > a:nth-of-type(2)')
nameSearch = compile(r'/page/([-0-9]+)/')
latestSearch = '#header-dropdown-comic-lfg > a:nth-of-type(2)'
starter = indirectStarter()
help = 'Index format: nnn'

View file

@ -102,9 +102,9 @@ class NichtLustig(_BasicScraper):
lang = 'de'
imageSearch = compile('background-image:url\((http://static\.nichtlustig\.de/comics/full/\d+\.jpg)')
prevSearch = compile(tagre("a", "href", r'(http://static\.nichtlustig\.de/toondb/\d+\.html)'))
latestSearch = compile(tagre("a", "href", r'([^"]*toondb/\d+\.html)'))
help = 'Index format: yymmdd'
starter = indirectStarter(
url, compile(tagre("a", "href", r'([^"]*toondb/\d+\.html)')))
starter = indirectStarter()
class Nicky510(_WordPressScraper):
@ -136,7 +136,8 @@ class NobodyScores(_BasicScraper):
class NoMoreSavePoints(_WordPressScraper):
url = 'http://www.flowerlarkstudios.com/comic/no-more-save-points/mushroom-hopping/'
firstStripUrl = url
starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH)
latestSearch = WP_LATEST_SEARCH
starter = indirectStarter()
class NoNeedForBushido(_BasicScraper):
@ -149,10 +150,10 @@ class NoNeedForBushido(_BasicScraper):
after="attachment-full"))
prevSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl,
after="previous-webcomic"))
latestSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl,
after="last-webcomic"))
help = 'Index format: nnn'
starter = indirectStarter(
url, compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl,
after="last-webcomic")))
starter = indirectStarter()
class NotInventedHere(_BasicScraper):

View file

@ -8,7 +8,6 @@ from __future__ import absolute_import, division, print_function
from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter
from ..util import tagre
from .common import _WordPressScraper, xpath_class
@ -53,7 +52,6 @@ class OkCancel(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(%sstrips/okcancel\d{8}\.gif)' % rurl))
prevSearch = compile(tagre("div", "class", "previous") +
tagre("a", "href", r'(%scomic/\d{1,4}\.html)' % rurl))
starter = indirectStarter(url, prevSearch)
help = 'Index format: yyyymmdd'

View file

@ -20,12 +20,13 @@ class PandyLand(_WordPressScraper):
class ParadigmShift(_BasicScraper):
url = 'http://www.paradigmshiftmanga.com/'
starter = indirectStarter(url, compile(tagre("a", "href", r'([^"]+)',
after="next-comic-link")))
starter = indirectStarter()
stripUrl = url + 'ps/%s.html'
imageSearch = compile(tagre("img", "src", r'([^"]*comics/ps/[^"]*)'))
prevSearch = compile(tagre("a", "href", r'([^"]+)',
after="previous-comic-link"))
latestSearch = compile(tagre("a", "href", r'([^"]+)',
after="next-comic-link"))
help = 'Index format: custom'
@ -72,7 +73,6 @@ class PennyAndAggie(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'(http://www\.pennyandaggie\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r"(index\.php\?p\=\d+)", quote="'") +
tagre("img", "src", r'%simages/previous_day\.gif' % rurl, quote=""))
starter = indirectStarter(url, prevSearch)
help = 'Index format: n (unpadded)'
@ -162,11 +162,12 @@ class PicPakDog(_BasicScraper):
class PiledHigherAndDeeper(_BasicScraper):
url = 'http://www.phdcomics.com/comics.php'
starter = bounceStarter(url, compile(r'<a href=(archive\.php\?comicid=\d+)>.*<img [^>]*next_button\.gif'))
starter = bounceStarter()
stripUrl = url + '?comicid=%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd\d+s\d?\.\w{3,4})', quote=""))
prevSearch = compile(r'<a href=((comics/)?archive\.php\?comicid=\d+)>.*<img [^>]*prev_button\.gif')
nextSearch = compile(r'<a href=(archive\.php\?comicid=\d+)>.*<img [^>]*next_button\.gif')
help = 'Index format: n (unpadded)'
namer = queryNamer('comicid', usePageUrl=True)
@ -204,9 +205,9 @@ class PokeyThePenguin(_ParserScraper):
stripUrl = url + 'index%s.html'
firstStripUrl = stripUrl % '1'
imageSearch = '//p/img'
prevSearch = True
latestSearch = '(//a)[last()]'
multipleImagesPerStrip = True
starter = indirectStarter(url, "(//a)[last()]")
starter = indirectStarter()
help = 'Index format: number'
def getPrevUrl(self, url, data):
@ -230,22 +231,22 @@ class PoorlyDrawnLines(_BasicScraper):
class Precocious(_BasicScraper):
url = 'http://www.precociouscomic.com/'
starter = indirectStarter(
url, compile(tagre("a", "href", r'(/archive/comic/[^"]+)') +
tagre("img", "src", r"/templates/precocious_main/images/next_arrow\.png"))
)
starter = indirectStarter()
stripUrl = url + 'archive/comic/%s'
imageSearch = compile(tagre("img", "src", r'(/comics/\d+[^"]*\.(?:jpg|gif))'))
prevSearch = compile(tagre("a", "href", r'(/archive/comic/[^"]+)') + tagre("img", "src", r"/templates/precocious_main/images/back_arrow\.png"))
latestSearch = compile(tagre("a", "href", r'(/archive/comic/[^"]+)') +
tagre("img", "src", r"/templates/precocious_main/images/next_arrow\.png"))
help = 'Index format: yyyy/mm/dd'
class PS238(_ParserScraper):
url = 'http://ps238.nodwick.com/'
stripUrl = url + '/comic/%s/'
starter = bounceStarter(url, '//a[@class="comic-nav-base comic-nav-next"]')
starter = bounceStarter()
imageSearch = '//div[@id="comic"]//img'
prevSearch = '//a[@class="comic-nav-base comic-nav-previous"]'
nextSearch = '//a[@class="comic-nav-base comic-nav-next"]'
help = 'Index format: yyyy-mm-dd'

View file

@ -1,10 +1,13 @@
# -*- coding: iso-8859-1 -*-
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile, escape
from ..scraper import _BasicScraper
from ..scraper import _ParserScraper
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import bounceStarter
from ..util import tagre
@ -16,6 +19,7 @@ class RadioactivePanda(_BasicScraper):
prevSearch = compile(r'<a href="(/comic/.*?)".+?previous_btn')
help = 'Index format: n (no padding)'
class RalfTheDestroyer(_ParserScraper):
url = 'http://ralfthedestroyer.com/'
stripUrl = url + '%s/'
@ -47,12 +51,12 @@ class RealmOfAtland(_BasicScraper):
class RedMeat(_BasicScraper):
baseUrl = 'http://www.redmeat.com/redmeat/'
url = baseUrl + 'current/index.html'
starter = bounceStarter(url,
compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="next")))
starter = bounceStarter()
stripUrl = baseUrl + '%s/index.html'
firstStripUrl = stripUrl % '1996-06-10'
imageSearch = compile(tagre("img", "src", r'(http://www\.redmeat\.com/imager/b/redmeat/[^"]*\.png)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="prev"))
nextSearch = compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="next"))
help = 'Index format: yyyy-mm-dd'
@ -81,7 +85,8 @@ class RomanticallyApocalyptic(_BasicScraper):
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(%sart/\d+[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%s\d+[^"]+)' % rurl)+"\s*"+tagre('span', 'class', 'spritePrevious'))
prevSearch = compile(tagre("a", "href", r'(%s\d+[^"]+)' % rurl) + "\s*" +
tagre('span', 'class', 'spritePrevious'))
help = 'Index format: n'
adult = True
@ -101,5 +106,6 @@ class Ruthe(_BasicScraper):
firstStripUrl = stripUrl % '1'
lang = 'de'
imageSearch = compile(tagre("img", "src", r'(/?cartoons/strip_\d+[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/cartoon/\d+/datum/asc/)')+'vorheriger')
prevSearch = compile(tagre("a", "href", r'(/cartoon/\d+/datum/asc/)') +
'vorheriger')
help = 'Index format: number'

View file

@ -69,9 +69,10 @@ class ScandinaviaAndTheWorld(_ParserScraper):
url = 'http://satwcomic.com/'
stripUrl = url + '%s'
firstStripUrl = stripUrl % 'sweden-denmark-and-norway'
starter = indirectStarter(url, '//a[text()="View latest comic"]')
starter = indirectStarter()
imageSearch = '//img[@itemprop="image"]'
prevSearch = '//a[@accesskey="p"]'
latestSearch = '//a[text()="View latest comic"]'
textSearch = '//span[@itemprop="articleBody"]'
help = 'Index format: stripname'
@ -194,9 +195,9 @@ class SexyLosers(_BasicScraper):
stripUrl = url + '%s.html'
imageSearch = compile(r'<img src\s*=\s*"\s*(comics/[\w\.]+?)"', IGNORECASE)
prevSearch = compile(r'<a href="(/\d{3}\.\w+?)"><font color = FFAAAA><<', IGNORECASE)
latestSearch = compile(r'SEXY LOSERS <A HREF="(.+?)">Latest SL Comic \(#\d+\)</A>', IGNORECASE)
help = 'Index format: nnn'
starter = indirectStarter(url,
compile(r'SEXY LOSERS <A HREF="(.+?)">Latest SL Comic \(#\d+\)</A>', IGNORECASE))
starter = indirectStarter()
@classmethod
def namer(cls, imageUrl, pageUrl):
@ -333,7 +334,8 @@ class SnowFlame(_WordPressScraper):
url = 'http://www.snowflamecomic.com/'
stripUrl = url + '?comic=snowflame-%s-%s'
firstStripUrl = stripUrl % ('01', '01')
starter = bounceStarter(url, WP_LATEST_SEARCH)
starter = bounceStarter()
nextSearch = WP_LATEST_SEARCH
help = 'Index format: chapter-page'
def getIndexStripUrl(self, index):
@ -392,8 +394,9 @@ class Spamusement(_BasicScraper):
imageSearch = compile(r'<img src="(%sgfx/\d+\..+?)"' % rurl, IGNORECASE)
prevSearch = compile(r'<a href="(%sindex.php/comics/view/.+?)">' % rurl,
IGNORECASE)
latestSearch = prevSearch
help = 'Index format: n (unpadded)'
starter = indirectStarter(url, prevSearch)
starter = indirectStarter()
class SpareParts(_BasicScraper):
@ -504,8 +507,7 @@ class StuffNoOneToldMe(_BasicScraper):
stripUrl = url + '%s.html'
firstStripUrl = stripUrl % '2010/05/01'
olderHref = r"(http://www\.snotm\.com/\d+/\d+/[^']+\.html)"
starter = indirectStarter(
url, compile(tagre("a", "href", olderHref, quote="'")))
starter = indirectStarter()
imageSearch = (
compile(tagre("img", "src", r'(http://i\.imgur\.com/[^"]+)') +
r"(?:</a>|<br />)"),
@ -515,6 +517,7 @@ class StuffNoOneToldMe(_BasicScraper):
)
prevSearch = compile(tagre("a", "href", olderHref, quote="'",
before="older-link"))
latestSearch = compile(tagre("a", "href", olderHref, quote="'"))
multipleImagesPerStrip = True
help = 'Index format: yyyy/mm/stripname'

View file

@ -76,14 +76,14 @@ class TheNoob(_BasicScraper):
class TheOrderOfTheStick(_BasicScraper):
baseUrl = 'http://www.giantitp.com/'
url = baseUrl + 'comics/oots0863.html'
stripUrl = baseUrl + 'comics/oots%s.html'
url = 'http://www.giantitp.com/'
stripUrl = url + 'comics/oots%s.html'
firstStripUrl = stripUrl % '0001'
imageSearch = compile(r'<IMG src="(/comics/images/[^"]+)">')
prevSearch = compile(r'<A href="(/comics/oots\d{4}\.html)"><IMG src="/Images/redesign/ComicNav_Back.gif"')
latestSearch = compile(r'<A href="(/comics/oots\d{4}\.html)"')
help = 'Index format: n (unpadded)'
starter = indirectStarter(baseUrl, compile(r'<A href="(/comics/oots\d{4}\.html)"'))
starter = indirectStarter()
@classmethod
def namer(cls, imageUrl, pageUrl):
@ -116,9 +116,13 @@ class TheThinHLine(_BasicScraper):
rurl = escape(url)
stripUrl = url + 'post/%s'
firstStripUrl = stripUrl % '3517345105'
imageSearch = compile(tagre('img', 'data-src', r'([^"]+media.tumblr.com/[^"]+)', before='content-image'))
imageSearch = compile(tagre('img', 'data-src',
r'([^"]+media.tumblr.com/[^"]+)',
before='content-image'))
prevSearch = compile(tagre("a", "href", r'([^"]+)') + '&gt;</a>')
starter = indirectStarter(url, compile(tagre("a", "href", r'([^"]+)', after='class="timestamp"')))
latestSearch = compile(tagre("a", "href", r'([^"]+)',
after='class="timestamp"'))
starter = indirectStarter()
adult = True
indirectImageSearch = compile(tagre('a', 'href', r'(%simage/\d+)' % rurl))

View file

@ -21,12 +21,10 @@ class Underling(_WordPressScraper):
class Undertow(_BasicScraper):
url = 'http://undertow.dreamshards.org/'
stripUrl = url + '%s'
imageSearch = compile(tagre("img", "src", r'([^"]+\.jpg)'))
prevSearch = compile(r'href="(.+?)".+?teynpoint')
help = 'Index format: good luck !'
starter = indirectStarter(url,
compile(r'href="(.+?)".+?Most recent page'))
latestSearch = compile(r'href="(.+?)".+?Most recent page')
starter = indirectStarter()
class UnicornJelly(_BasicScraper):
@ -46,9 +44,10 @@ class Unsounded(_BasicScraper):
rurl = escape(url)
imageSearch = compile(tagre("img", "src", r'(pageart/[^"]*)'))
prevSearch = compile(tagre("a", "href", r'([^"]*)', after='class="back'))
starter = indirectStarter(
url, compile(tagre("a", "href", r'(%scomic/[^"]*)' % rurl) +
tagre("img", "src", r"%simages/newpages\.png" % rurl)))
latestSearch = compile(tagre("a", "href", r'(%scomic/[^"]*)' % rurl) +
tagre("img", "src",
r"%simages/newpages\.png" % rurl))
starter = indirectStarter()
help = 'Index format: chapter-number'
def getIndexStripUrl(self, index):

View file

@ -45,7 +45,7 @@ class WayfarersMoon(_BasicScraper):
class WebDesignerCOTW(_BasicScraper):
url = 'http://www.webdesignerdepot.com/'
rurl = escape(url)
starter = indirectStarter(url, compile(tagre("a", "href", r'(%s\d+/\d+/[^"]+/)' % rurl)))
starter = indirectStarter()
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2009/11/comics-of-the-week-1'
imageSearch = (
@ -57,6 +57,7 @@ class WebDesignerCOTW(_BasicScraper):
multipleImagesPerStrip = True
prevSearch = compile(tagre("link", "href", r"(%s\d+/\d+/[^']+)" % rurl,
before='prev', quote="'"))
latestSearch = compile(tagre("a", "href", r'(%s\d+/\d+/[^"]+/)' % rurl))
help = 'Index format: yyyy/mm/stripname'
def shouldSkipUrl(self, url, data):

View file

@ -4,13 +4,13 @@ from ..scraper import make_scraper
from .common import _WordPressScraper
def add(name, url, starter=None):
def add(name, start):
attrs = dict(
name=name,
url=url
url='http://hijinksensue.com/',
latestSearch=start,
starter=indirectStarter()
)
if starter:
attrs['starter'] = starter
globals()[name] = make_scraper(name, _WordPressScraper, **attrs)
@ -22,4 +22,4 @@ for (name, starterXPath) in [
('HijinksEnsueConvention', '//h4[text()="Latest Fancy Convention Sketches"]/..//a'),
('HijinksEnsuePhoto', '//h4[text()="Latest Fancy Photo Comic"]/..//a')
]:
add(name, 'http://hijinksensue.com/', starter=indirectStarter('http://hijinksensue.com/', starterXPath))
add(name, starterXPath)

View file

@ -1,6 +1,9 @@
# -*- coding: iso-8859-1 -*-
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile
@ -12,13 +15,13 @@ from ..util import tagre
class Xkcd(_BasicScraper):
name = 'xkcd'
url = 'http://xkcd.com/'
starter = bounceStarter(url, compile(tagre("a", "href", r'(/\d+/)',
before="next")))
starter = bounceStarter()
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src",
r'(//imgs\.xkcd\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/\d+/)', before="prev"))
nextSearch = compile(tagre("a", "href", r'(/\d+/)', before="next"))
help = 'Index format: n (unpadded)'
textSearch = compile(tagre("img", "title", r'([^"]+)',
before=r'//imgs\.xkcd\.com/comics/'))

View file

@ -22,15 +22,16 @@ class ZapComic(_ParserScraper):
class Zapiro(_BasicScraper):
url = 'http://www.mg.co.za/zapiro/'
starter = bounceStarter(
url, compile(tagre("li", "class", r'nav_older') +
tagre("a", "href", r'(http://mg\.co\.za/cartoon/[^"]+)')))
starter = bounceStarter()
stripUrl = 'http://mg.co.za/cartoon/%s'
firstStripUrl = stripUrl % 'zapiro_681'
imageSearch = compile(tagre("img", "src", r'(http://cdn\.mg\.co\.za/crop/content/cartoons/[^"]+)'))
prevSearch = compile(tagre("li", "class", r'nav_older') +
tagre("a", "href",
r'(http://mg\.co\.za/cartoon/[^"]+)'))
nextSearch = compile(tagre("li", "class", r'nav_older') +
tagre("a", "href",
r'(http://mg\.co\.za/cartoon/[^"]+)'))
help = 'Index format: yyyy-mm-dd-stripname'
@classmethod