From 42e43fa4e64dff25b1bf887ac8626c9f35f80329 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 12 Apr 2016 23:11:39 +0200 Subject: [PATCH] Read starter parameters from class. This allows to specify starters in a more declarative and dynamic way. --- dosagelib/helpers.py | 32 +++++++++++++++++++++++--------- dosagelib/plugins/a.py | 15 +++++++-------- dosagelib/plugins/b.py | 12 ++++++++---- dosagelib/plugins/c.py | 13 ++++++++----- dosagelib/plugins/d.py | 31 +++++++++++++++---------------- dosagelib/plugins/e.py | 16 ++++++++++------ dosagelib/plugins/f.py | 14 +++++++------- dosagelib/plugins/g.py | 12 ++++++------ dosagelib/plugins/h.py | 6 +++--- dosagelib/plugins/j.py | 20 ++++++++++++++------ dosagelib/plugins/k.py | 2 -- dosagelib/plugins/l.py | 20 ++++++++++---------- dosagelib/plugins/n.py | 13 +++++++------ dosagelib/plugins/o.py | 2 -- dosagelib/plugins/p.py | 23 ++++++++++++----------- dosagelib/plugins/r.py | 20 +++++++++++++------- dosagelib/plugins/s.py | 17 ++++++++++------- dosagelib/plugins/t.py | 16 ++++++++++------ dosagelib/plugins/u.py | 13 ++++++------- dosagelib/plugins/w.py | 3 ++- dosagelib/plugins/wordpress.py | 10 +++++----- dosagelib/plugins/x.py | 9 ++++++--- dosagelib/plugins/z.py | 7 ++++--- 23 files changed, 186 insertions(+), 140 deletions(-) diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py index 7b7a62940..2777891c2 100644 --- a/dosagelib/helpers.py +++ b/dosagelib/helpers.py @@ -1,8 +1,13 @@ -# -*- coding: iso-8859-1 -*- +# -*- coding: utf-8 -*- # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher + +from __future__ import absolute_import, division, print_function + from .util import getQueryParams + def queryNamer(paramName, usePageUrl=False): """Get name from URL query part.""" @classmethod @@ -25,23 +30,32 @@ def regexNamer(regex, usePageUrl=False): return _namer -def bounceStarter(url, nextSearch): - """Get start URL by "bouncing" back and forth one time.""" +def bounceStarter(): + """Get start URL by "bouncing" back and forth one time. + + This needs the url and nextSearch properties be defined on the class. + """ @classmethod def _starter(cls): """Get bounced start URL.""" - data = cls.getPage(url) - url1 = cls.fetchUrl(url, data, cls.prevSearch) + data = cls.getPage(cls.url) + url1 = cls.fetchUrl(cls.url, data, cls.prevSearch) data = cls.getPage(url1) - return cls.fetchUrl(url1, data, nextSearch) + return cls.fetchUrl(url1, data, cls.nextSearch) return _starter -def indirectStarter(url, latestSearch): - """Get start URL by indirection.""" +def indirectStarter(): + """Get start URL by indirection. + + This is useful for comics where the latest comic can't be reached at a + stable URL. If the class has an attribute 'startUrl', this page is fetched + first, otherwise the page at 'url' is fetched. After that, the attribute + 'latestSearch' is used on the page content to find the latest strip.""" @classmethod def _starter(cls): """Get indirect start URL.""" + url = cls.startUrl if hasattr(cls, "startUrl") else cls.url data = cls.getPage(url) - return cls.fetchUrl(url, data, latestSearch) + return cls.fetchUrl(url, data, cls.latestSearch) return _starter diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py index a08e1c6c4..141cd982b 100644 --- a/dosagelib/plugins/a.py +++ b/dosagelib/plugins/a.py @@ -16,8 +16,7 @@ from .common import _WordPressScraper, xpath_class, WP_LATEST_SEARCH class AbstruseGoose(_BasicScraper): url = 'http://abstrusegoose.com/' rurl = escape(url) - starter = bounceStarter( - url, compile(tagre('a', 'href', r'(%s\d+)' % rurl) + "Next »")) + starter = bounceStarter() stripUrl = url + '%s' firstStripUrl = stripUrl % '1' imageSearch = compile(tagre('img', 'src', @@ -81,7 +80,6 @@ class AfterStrife(_WordPressScraper): class AGirlAndHerFed(_BasicScraper): url = 'http://www.agirlandherfed.com/' - starter = bounceStarter(url, compile(r'[^>]+Back')) stripUrl = url + '1.%s.html' firstStripUrl = stripUrl % '1' imageSearch = compile(tagre("img", "src", r'(img/strip/[^"]+\.jpg)')) @@ -114,7 +112,6 @@ class ALessonIsLearned(_BasicScraper): url = 'http://www.alessonislearned.com/' prevSearch = compile(tagre("a", "href", r"(index\.php\?comic=\d+)", quote="'") + r"[^>]+previous") - starter = indirectStarter(url, prevSearch) stripUrl = url + 'index.php?comic=%s' firstStripUrl = stripUrl % '1' imageSearch = compile(tagre("img", "src", r"(cmx/lesson\d+\.[a-z]+)")) @@ -124,8 +121,8 @@ class ALessonIsLearned(_BasicScraper): class Alice(_WordPressScraper): url = 'http://www.alicecomics.com/' prevSearch = '//a[%s]' % xpath_class('navi-prev-in') - starter = indirectStarter('http://www.alicecomics.com/', - '//a[text()="Latest Alice!"]') + latestSearch = '//a[text()="Latest Alice!"]' + starter = indirectStarter() class AlienLovesPredator(_BasicScraper): @@ -264,7 +261,8 @@ class ARedTailsDream(_BasicScraper): class Ashes(_WordPressScraper): url = 'http://www.flowerlarkstudios.com/comic/prologue/10232009/' firstStripUrl = url - starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH) + latestSearch = WP_LATEST_SEARCH + starter = indirectStarter() class ASkeweredParadise(_BasicScraper): @@ -289,12 +287,13 @@ class ASofterWorld(_ParserScraper): class AstronomyPOTD(_ParserScraper): baseUrl = 'http://apod.nasa.gov/apod/' url = baseUrl + 'astropix.html' - starter = bounceStarter(url, '//a[text()=">"]') + starter = bounceStarter() stripUrl = baseUrl + 'ap%s.html' firstStripUrl = stripUrl % '061012' imageSearch = '//a/img' multipleImagesPerStrip = True prevSearch = '//a[text()="<"]' + nextSearch = '//a[text()=">"]' help = 'Index format: yymmdd' def shouldSkipUrl(self, url, data): diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py index 349ef152c..ceb88f9ed 100644 --- a/dosagelib/plugins/b.py +++ b/dosagelib/plugins/b.py @@ -47,8 +47,8 @@ class BalderDash(_ComicControlScraper): class Bardsworth(_WordPressScraper): url = 'http://www.bardsworth.com/' - starter = indirectStarter('http://www.bardsworth.com/', - '//a[@rel="bookmark"]') + latestSearch = '//a[@rel="bookmark"]' + starter = indirectStarter() class Baroquen(_BasicScraper): @@ -72,12 +72,15 @@ class Beetlebum(_BasicScraper): rurl = escape(url) stripUrl = url + '%s' firstStripUrl = stripUrl % '2006/03/10/quiz-fur-ruskiphile' - starter = indirectStarter(url, compile(tagre('a', 'href', r'(%s\d{4}/\d{2}/\d{2}/[^"]+)' % rurl, after='bookmark'))) + starter = indirectStarter() multipleImagesPerStrip = True imageSearch = compile(tagre('img', 'src', r'(http://blog\.beetlebum\.de/wp-content/uploads/[^"]+)')) prevSearch = compile(tagre('a', 'href', r'(%s\d{4}/\d{2}/\d{2}/[^"]*)' % rurl, after='prev')) + latestSearch = compile(tagre('a', 'href', + r'(%s\d{4}/\d{2}/\d{2}/[^"]+)' % rurl, + after='bookmark')) help = 'Index format: yyyy/mm/dd/striptitle' lang = 'de' @@ -223,7 +226,8 @@ class BoredAndEvil(_BasicScraper): firstStripUrl = stripUrl % '2004-06-07' imageSearch = compile(tagre("img", "src", r'(strips/[^"]+)')) prevSearch = compile(r'First Comic.+\s*\s*") - starter = indirectStarter( - url, - compile(tagre('a', 'href', "(" + url + r'\d\d\d\d/\d\d/\d\d/[^"]+/)'))) + latestSearch = compile(tagre('a', 'href', "(" + url + + r'\d\d\d\d/\d\d/\d\d/[^"]+/)')) + starter = indirectStarter() class Freefall(_BasicScraper): diff --git a/dosagelib/plugins/g.py b/dosagelib/plugins/g.py index 675320db2..7f42d37b9 100644 --- a/dosagelib/plugins/g.py +++ b/dosagelib/plugins/g.py @@ -27,15 +27,15 @@ class Garanos(_BasicScraper): baseUrl = 'http://garanos.alexheberling.com/' rurl = escape(baseUrl) url = baseUrl + 'pages/page-1/' - starter = indirectStarter( - url, compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl, - after="nav-last"))) + starter = indirectStarter() stripUrl = baseUrl + 'pages/page-%s' imageSearch = compile( tagre("img", "src", r'(%swp-content/uploads/sites/\d+/\d+/\d+/[^"]+)' % rurl)) prevSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl, after="prev")) + latestSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl, + after="nav-last")) help = 'Index format: n (unpadded)' @@ -136,14 +136,14 @@ class GoGetARoomie(_ComicControlScraper): class GoneWithTheBlastwave(_BasicScraper): url = 'http://www.blastwave-comic.com/index.php?p=comic&nro=1' - starter = indirectStarter( - url, compile(r'href="(index.php\?p=comic&nro=\d+)">' + - r'' + r'' + + r']+\.html)', quote=""))) - imageSearch = compile(tagre("img", "src", r'(JimBentonComic-[^"]+)', before="photo-frame")) - prevSearch = compile(tagre("a", "href", r'(JimBentonComic-[^>]+\.html)', quote="") + "Next") + starter = indirectStarter() + imageSearch = compile(tagre("img", "src", r'(JimBentonComic-[^"]+)', + before="photo-frame")) + prevSearch = compile(tagre("a", "href", r'(JimBentonComic-[^>]+\.html)', + quote="") + "Next") + latestSearch = compile(tagre("a", "href", r'(files/JimBentonComic-[^>]+\.html)', quote="")) help = 'Index format: stripname' @@ -58,6 +65,7 @@ class JustAnotherEscape(_BasicScraper): rurl = escape(url) stripUrl = url + 'index.cgi?date=%s' imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl)) - prevSearch = compile(tagre("a", "href", r'(%s/index\.cgi\?date=\d+)' % rurl) - + tagre("img", "alt", "Previous Comic")) + prevSearch = compile(tagre("a", "href", + r'(%s/index\.cgi\?date=\d+)' % rurl) + + tagre("img", "alt", "Previous Comic")) help = 'Index format: yyyymmdd' diff --git a/dosagelib/plugins/k.py b/dosagelib/plugins/k.py index 6f8ce9d94..da7384833 100644 --- a/dosagelib/plugins/k.py +++ b/dosagelib/plugins/k.py @@ -9,7 +9,6 @@ from re import compile, escape, IGNORECASE from ..scraper import _BasicScraper from ..util import tagre -from ..helpers import indirectStarter from .common import _ComicControlScraper, _WordPressScraper, xpath_class @@ -81,4 +80,3 @@ class KuroShouri(_BasicScraper): tagre("a", "href", r'(%s\?webcomic_post\=[^"]+)' % rurl, after="previous")) help = 'Index format: chapter-n-page-m' - starter = indirectStarter(url, prevSearch) diff --git a/dosagelib/plugins/l.py b/dosagelib/plugins/l.py index 8e7d5dc0f..2d0f731ae 100644 --- a/dosagelib/plugins/l.py +++ b/dosagelib/plugins/l.py @@ -21,10 +21,10 @@ class Lackadaisy(_BasicScraper): imageSearch = compile(tagre("img", "src", r'(http://www\.lackadaisycats\.com/comic/[^"]*)')) prevSearch = compile(tagre("a", "href", r"(/comic\.php\?comicid=[0-9]+)") + "< Previous") + nextSearch = compile(tagre("a", "href", r"(/comic.php\?comicid=[0-9]+)") + + "Next") help = 'Index format: n' - starter = bounceStarter( - url, compile(tagre("a", "href", r"(/comic.php\?comicid=[0-9]+)") + - "Next")) + starter = bounceStarter() @classmethod def namer(cls, imageUrl, pageUrl): @@ -37,7 +37,8 @@ class Lackadaisy(_BasicScraper): class Laiyu(_WordPressScraper): url = 'http://www.flowerlarkstudios.com/comic/preliminary-concepts/welcome/' firstStripUrl = url - starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH) + latestSearch = WP_LATEST_SEARCH + starter = indirectStarter() class LasLindas(_BasicScraper): @@ -64,9 +65,9 @@ class LeastICouldDo(_BasicScraper): imageSearch = compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/\d{8,9}\.\w{1,4})' % rurl)) prevSearch = compile(tagre("a", "href", r'(%scomic/\d+/)' % rurl, after="Previous")) - starter = indirectStarter( - url, compile(tagre("a", "href", r'(%scomic/\d+/)' % rurl, - after="feature-comic"))) + latestSearch = compile(tagre("a", "href", r'(%scomic/\d+/)' % rurl, + after="feature-comic")) + starter = indirectStarter() help = 'Index format: yyyymmdd' @@ -110,12 +111,11 @@ class LoadingArtist(_ParserScraper): class LookingForGroup(_ParserScraper): url = 'http://www.lfgcomic.com/' - rurl = escape(url) stripUrl = url + 'page/%s/' firstStripUrl = stripUrl % '1' css = True imageSearch = '#comic img' prevSearch = '#comic-left > a' - starter = indirectStarter(url, '#header-dropdown-comic-lfg > a:nth-of-type(2)') - nameSearch = compile(r'/page/([-0-9]+)/') + latestSearch = '#header-dropdown-comic-lfg > a:nth-of-type(2)' + starter = indirectStarter() help = 'Index format: nnn' diff --git a/dosagelib/plugins/n.py b/dosagelib/plugins/n.py index e1b4bef97..d8b157ca4 100644 --- a/dosagelib/plugins/n.py +++ b/dosagelib/plugins/n.py @@ -102,9 +102,9 @@ class NichtLustig(_BasicScraper): lang = 'de' imageSearch = compile('background-image:url\((http://static\.nichtlustig\.de/comics/full/\d+\.jpg)') prevSearch = compile(tagre("a", "href", r'(http://static\.nichtlustig\.de/toondb/\d+\.html)')) + latestSearch = compile(tagre("a", "href", r'([^"]*toondb/\d+\.html)')) help = 'Index format: yymmdd' - starter = indirectStarter( - url, compile(tagre("a", "href", r'([^"]*toondb/\d+\.html)'))) + starter = indirectStarter() class Nicky510(_WordPressScraper): @@ -136,7 +136,8 @@ class NobodyScores(_BasicScraper): class NoMoreSavePoints(_WordPressScraper): url = 'http://www.flowerlarkstudios.com/comic/no-more-save-points/mushroom-hopping/' firstStripUrl = url - starter = indirectStarter(firstStripUrl, WP_LATEST_SEARCH) + latestSearch = WP_LATEST_SEARCH + starter = indirectStarter() class NoNeedForBushido(_BasicScraper): @@ -149,10 +150,10 @@ class NoNeedForBushido(_BasicScraper): after="attachment-full")) prevSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl, after="previous-webcomic")) + latestSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl, + after="last-webcomic")) help = 'Index format: nnn' - starter = indirectStarter( - url, compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl, - after="last-webcomic"))) + starter = indirectStarter() class NotInventedHere(_BasicScraper): diff --git a/dosagelib/plugins/o.py b/dosagelib/plugins/o.py index 2dc5ca973..c81fbe563 100644 --- a/dosagelib/plugins/o.py +++ b/dosagelib/plugins/o.py @@ -8,7 +8,6 @@ from __future__ import absolute_import, division, print_function from re import compile, escape from ..scraper import _BasicScraper, _ParserScraper -from ..helpers import indirectStarter from ..util import tagre from .common import _WordPressScraper, xpath_class @@ -53,7 +52,6 @@ class OkCancel(_BasicScraper): imageSearch = compile(tagre("img", "src", r'(%sstrips/okcancel\d{8}\.gif)' % rurl)) prevSearch = compile(tagre("div", "class", "previous") + tagre("a", "href", r'(%scomic/\d{1,4}\.html)' % rurl)) - starter = indirectStarter(url, prevSearch) help = 'Index format: yyyymmdd' diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py index 7d3ac392e..bc407aafe 100755 --- a/dosagelib/plugins/p.py +++ b/dosagelib/plugins/p.py @@ -20,12 +20,13 @@ class PandyLand(_WordPressScraper): class ParadigmShift(_BasicScraper): url = 'http://www.paradigmshiftmanga.com/' - starter = indirectStarter(url, compile(tagre("a", "href", r'([^"]+)', - after="next-comic-link"))) + starter = indirectStarter() stripUrl = url + 'ps/%s.html' imageSearch = compile(tagre("img", "src", r'([^"]*comics/ps/[^"]*)')) prevSearch = compile(tagre("a", "href", r'([^"]+)', after="previous-comic-link")) + latestSearch = compile(tagre("a", "href", r'([^"]+)', + after="next-comic-link")) help = 'Index format: custom' @@ -72,7 +73,6 @@ class PennyAndAggie(_BasicScraper): imageSearch = compile(tagre("img", "src", r'(http://www\.pennyandaggie\.com/comics/[^"]+)')) prevSearch = compile(tagre("a", "href", r"(index\.php\?p\=\d+)", quote="'") + tagre("img", "src", r'%simages/previous_day\.gif' % rurl, quote="")) - starter = indirectStarter(url, prevSearch) help = 'Index format: n (unpadded)' @@ -162,11 +162,12 @@ class PicPakDog(_BasicScraper): class PiledHigherAndDeeper(_BasicScraper): url = 'http://www.phdcomics.com/comics.php' - starter = bounceStarter(url, compile(r'.*]*next_button\.gif')) + starter = bounceStarter() stripUrl = url + '?comicid=%s' firstStripUrl = stripUrl % '1' imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd\d+s\d?\.\w{3,4})', quote="")) prevSearch = compile(r'.*]*prev_button\.gif') + nextSearch = compile(r'.*]*next_button\.gif') help = 'Index format: n (unpadded)' namer = queryNamer('comicid', usePageUrl=True) @@ -204,9 +205,9 @@ class PokeyThePenguin(_ParserScraper): stripUrl = url + 'index%s.html' firstStripUrl = stripUrl % '1' imageSearch = '//p/img' - prevSearch = True + latestSearch = '(//a)[last()]' multipleImagesPerStrip = True - starter = indirectStarter(url, "(//a)[last()]") + starter = indirectStarter() help = 'Index format: number' def getPrevUrl(self, url, data): @@ -230,22 +231,22 @@ class PoorlyDrawnLines(_BasicScraper): class Precocious(_BasicScraper): url = 'http://www.precociouscomic.com/' - starter = indirectStarter( - url, compile(tagre("a", "href", r'(/archive/comic/[^"]+)') + - tagre("img", "src", r"/templates/precocious_main/images/next_arrow\.png")) - ) + starter = indirectStarter() stripUrl = url + 'archive/comic/%s' imageSearch = compile(tagre("img", "src", r'(/comics/\d+[^"]*\.(?:jpg|gif))')) prevSearch = compile(tagre("a", "href", r'(/archive/comic/[^"]+)') + tagre("img", "src", r"/templates/precocious_main/images/back_arrow\.png")) + latestSearch = compile(tagre("a", "href", r'(/archive/comic/[^"]+)') + + tagre("img", "src", r"/templates/precocious_main/images/next_arrow\.png")) help = 'Index format: yyyy/mm/dd' class PS238(_ParserScraper): url = 'http://ps238.nodwick.com/' stripUrl = url + '/comic/%s/' - starter = bounceStarter(url, '//a[@class="comic-nav-base comic-nav-next"]') + starter = bounceStarter() imageSearch = '//div[@id="comic"]//img' prevSearch = '//a[@class="comic-nav-base comic-nav-previous"]' + nextSearch = '//a[@class="comic-nav-base comic-nav-next"]' help = 'Index format: yyyy-mm-dd' diff --git a/dosagelib/plugins/r.py b/dosagelib/plugins/r.py index 5980f673f..83b13446b 100644 --- a/dosagelib/plugins/r.py +++ b/dosagelib/plugins/r.py @@ -1,10 +1,13 @@ -# -*- coding: iso-8859-1 -*- +# -*- coding: utf-8 -*- # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher + +from __future__ import absolute_import, division, print_function from re import compile, escape -from ..scraper import _BasicScraper -from ..scraper import _ParserScraper + +from ..scraper import _BasicScraper, _ParserScraper from ..helpers import bounceStarter from ..util import tagre @@ -16,6 +19,7 @@ class RadioactivePanda(_BasicScraper): prevSearch = compile(r'<<', IGNORECASE) + latestSearch = compile(r'SEXY LOSERS Latest SL Comic \(#\d+\)', IGNORECASE) help = 'Index format: nnn' - starter = indirectStarter(url, - compile(r'SEXY LOSERS Latest SL Comic \(#\d+\)', IGNORECASE)) + starter = indirectStarter() @classmethod def namer(cls, imageUrl, pageUrl): @@ -333,7 +334,8 @@ class SnowFlame(_WordPressScraper): url = 'http://www.snowflamecomic.com/' stripUrl = url + '?comic=snowflame-%s-%s' firstStripUrl = stripUrl % ('01', '01') - starter = bounceStarter(url, WP_LATEST_SEARCH) + starter = bounceStarter() + nextSearch = WP_LATEST_SEARCH help = 'Index format: chapter-page' def getIndexStripUrl(self, index): @@ -392,8 +394,9 @@ class Spamusement(_BasicScraper): imageSearch = compile(r'' % rurl, IGNORECASE) + latestSearch = prevSearch help = 'Index format: n (unpadded)' - starter = indirectStarter(url, prevSearch) + starter = indirectStarter() class SpareParts(_BasicScraper): @@ -504,8 +507,7 @@ class StuffNoOneToldMe(_BasicScraper): stripUrl = url + '%s.html' firstStripUrl = stripUrl % '2010/05/01' olderHref = r"(http://www\.snotm\.com/\d+/\d+/[^']+\.html)" - starter = indirectStarter( - url, compile(tagre("a", "href", olderHref, quote="'"))) + starter = indirectStarter() imageSearch = ( compile(tagre("img", "src", r'(http://i\.imgur\.com/[^"]+)') + r"(?:|
)"), @@ -515,6 +517,7 @@ class StuffNoOneToldMe(_BasicScraper): ) prevSearch = compile(tagre("a", "href", olderHref, quote="'", before="older-link")) + latestSearch = compile(tagre("a", "href", olderHref, quote="'")) multipleImagesPerStrip = True help = 'Index format: yyyy/mm/stripname' diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py index c0c064ce0..c4cdf58cb 100755 --- a/dosagelib/plugins/t.py +++ b/dosagelib/plugins/t.py @@ -76,14 +76,14 @@ class TheNoob(_BasicScraper): class TheOrderOfTheStick(_BasicScraper): - baseUrl = 'http://www.giantitp.com/' - url = baseUrl + 'comics/oots0863.html' - stripUrl = baseUrl + 'comics/oots%s.html' + url = 'http://www.giantitp.com/' + stripUrl = url + 'comics/oots%s.html' firstStripUrl = stripUrl % '0001' imageSearch = compile(r'') prevSearch = compile(r'') - starter = indirectStarter(url, compile(tagre("a", "href", r'([^"]+)', after='class="timestamp"'))) + latestSearch = compile(tagre("a", "href", r'([^"]+)', + after='class="timestamp"')) + starter = indirectStarter() adult = True indirectImageSearch = compile(tagre('a', 'href', r'(%simage/\d+)' % rurl)) diff --git a/dosagelib/plugins/u.py b/dosagelib/plugins/u.py index 3feb65f0e..c1c4e87d5 100644 --- a/dosagelib/plugins/u.py +++ b/dosagelib/plugins/u.py @@ -21,12 +21,10 @@ class Underling(_WordPressScraper): class Undertow(_BasicScraper): url = 'http://undertow.dreamshards.org/' - stripUrl = url + '%s' imageSearch = compile(tagre("img", "src", r'([^"]+\.jpg)')) prevSearch = compile(r'href="(.+?)".+?teynpoint') - help = 'Index format: good luck !' - starter = indirectStarter(url, - compile(r'href="(.+?)".+?Most recent page')) + latestSearch = compile(r'href="(.+?)".+?Most recent page') + starter = indirectStarter() class UnicornJelly(_BasicScraper): @@ -46,9 +44,10 @@ class Unsounded(_BasicScraper): rurl = escape(url) imageSearch = compile(tagre("img", "src", r'(pageart/[^"]*)')) prevSearch = compile(tagre("a", "href", r'([^"]*)', after='class="back')) - starter = indirectStarter( - url, compile(tagre("a", "href", r'(%scomic/[^"]*)' % rurl) + - tagre("img", "src", r"%simages/newpages\.png" % rurl))) + latestSearch = compile(tagre("a", "href", r'(%scomic/[^"]*)' % rurl) + + tagre("img", "src", + r"%simages/newpages\.png" % rurl)) + starter = indirectStarter() help = 'Index format: chapter-number' def getIndexStripUrl(self, index): diff --git a/dosagelib/plugins/w.py b/dosagelib/plugins/w.py index a28f2bb8f..157e853f8 100644 --- a/dosagelib/plugins/w.py +++ b/dosagelib/plugins/w.py @@ -45,7 +45,7 @@ class WayfarersMoon(_BasicScraper): class WebDesignerCOTW(_BasicScraper): url = 'http://www.webdesignerdepot.com/' rurl = escape(url) - starter = indirectStarter(url, compile(tagre("a", "href", r'(%s\d+/\d+/[^"]+/)' % rurl))) + starter = indirectStarter() stripUrl = url + '%s/' firstStripUrl = stripUrl % '2009/11/comics-of-the-week-1' imageSearch = ( @@ -57,6 +57,7 @@ class WebDesignerCOTW(_BasicScraper): multipleImagesPerStrip = True prevSearch = compile(tagre("link", "href", r"(%s\d+/\d+/[^']+)" % rurl, before='prev', quote="'")) + latestSearch = compile(tagre("a", "href", r'(%s\d+/\d+/[^"]+/)' % rurl)) help = 'Index format: yyyy/mm/stripname' def shouldSkipUrl(self, url, data): diff --git a/dosagelib/plugins/wordpress.py b/dosagelib/plugins/wordpress.py index fd3665e9f..7683cb73b 100644 --- a/dosagelib/plugins/wordpress.py +++ b/dosagelib/plugins/wordpress.py @@ -4,13 +4,13 @@ from ..scraper import make_scraper from .common import _WordPressScraper -def add(name, url, starter=None): +def add(name, start): attrs = dict( name=name, - url=url + url='http://hijinksensue.com/', + latestSearch=start, + starter=indirectStarter() ) - if starter: - attrs['starter'] = starter globals()[name] = make_scraper(name, _WordPressScraper, **attrs) @@ -22,4 +22,4 @@ for (name, starterXPath) in [ ('HijinksEnsueConvention', '//h4[text()="Latest Fancy Convention Sketches"]/..//a'), ('HijinksEnsuePhoto', '//h4[text()="Latest Fancy Photo Comic"]/..//a') ]: - add(name, 'http://hijinksensue.com/', starter=indirectStarter('http://hijinksensue.com/', starterXPath)) + add(name, starterXPath) diff --git a/dosagelib/plugins/x.py b/dosagelib/plugins/x.py index 8ec8a7d21..7fa17322c 100644 --- a/dosagelib/plugins/x.py +++ b/dosagelib/plugins/x.py @@ -1,6 +1,9 @@ -# -*- coding: iso-8859-1 -*- +# -*- coding: utf-8 -*- # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher + +from __future__ import absolute_import, division, print_function from re import compile @@ -12,13 +15,13 @@ from ..util import tagre class Xkcd(_BasicScraper): name = 'xkcd' url = 'http://xkcd.com/' - starter = bounceStarter(url, compile(tagre("a", "href", r'(/\d+/)', - before="next"))) + starter = bounceStarter() stripUrl = url + '%s/' firstStripUrl = stripUrl % '1' imageSearch = compile(tagre("img", "src", r'(//imgs\.xkcd\.com/comics/[^"]+)')) prevSearch = compile(tagre("a", "href", r'(/\d+/)', before="prev")) + nextSearch = compile(tagre("a", "href", r'(/\d+/)', before="next")) help = 'Index format: n (unpadded)' textSearch = compile(tagre("img", "title", r'([^"]+)', before=r'//imgs\.xkcd\.com/comics/')) diff --git a/dosagelib/plugins/z.py b/dosagelib/plugins/z.py index 02326eda1..07aa54dc6 100644 --- a/dosagelib/plugins/z.py +++ b/dosagelib/plugins/z.py @@ -22,15 +22,16 @@ class ZapComic(_ParserScraper): class Zapiro(_BasicScraper): url = 'http://www.mg.co.za/zapiro/' - starter = bounceStarter( - url, compile(tagre("li", "class", r'nav_older') + - tagre("a", "href", r'(http://mg\.co\.za/cartoon/[^"]+)'))) + starter = bounceStarter() stripUrl = 'http://mg.co.za/cartoon/%s' firstStripUrl = stripUrl % 'zapiro_681' imageSearch = compile(tagre("img", "src", r'(http://cdn\.mg\.co\.za/crop/content/cartoons/[^"]+)')) prevSearch = compile(tagre("li", "class", r'nav_older') + tagre("a", "href", r'(http://mg\.co\.za/cartoon/[^"]+)')) + nextSearch = compile(tagre("li", "class", r'nav_older') + + tagre("a", "href", + r'(http://mg\.co\.za/cartoon/[^"]+)')) help = 'Index format: yyyy-mm-dd-stripname' @classmethod