From 2567bd4e57e7b75342a10a1b308e7d9600d835e6 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Wed, 23 Jul 2014 20:53:59 +0200 Subject: [PATCH] Convert starters and other helpers to new interface. This allows those starters to work with future scrapers. --- dosagelib/helpers.py | 14 +++++++------- dosagelib/plugins/clonemanga.py | 12 ++++++------ dosagelib/plugins/drunkduck.py | 12 ++++++------ dosagelib/plugins/h.py | 6 +++--- dosagelib/plugins/p.py | 10 +++++----- dosagelib/plugins/smackjeeves.py | 10 +++++----- dosagelib/plugins/t.py | 10 +++++----- 7 files changed, 37 insertions(+), 37 deletions(-) diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py index 53118be46..7b7a62940 100644 --- a/dosagelib/helpers.py +++ b/dosagelib/helpers.py @@ -1,7 +1,7 @@ # -*- coding: iso-8859-1 -*- # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -from .util import fetchUrl, getPageContent, getQueryParams +from .util import getQueryParams def queryNamer(paramName, usePageUrl=False): """Get name from URL query part.""" @@ -30,10 +30,10 @@ def bounceStarter(url, nextSearch): @classmethod def _starter(cls): """Get bounced start URL.""" - data, baseUrl = getPageContent(url, cls.session) - url1 = fetchUrl(url, data, baseUrl, cls.prevSearch) - data, baseUrl = getPageContent(url1, cls.session) - return fetchUrl(url1, data, baseUrl, nextSearch) + data = cls.getPage(url) + url1 = cls.fetchUrl(url, data, cls.prevSearch) + data = cls.getPage(url1) + return cls.fetchUrl(url1, data, nextSearch) return _starter @@ -42,6 +42,6 @@ def indirectStarter(url, latestSearch): @classmethod def _starter(cls): """Get indirect start URL.""" - data, baseUrl = getPageContent(url, cls.session) - return fetchUrl(url, data, baseUrl, latestSearch) + data = cls.getPage(url) + return cls.fetchUrl(url, data, latestSearch) return _starter diff --git a/dosagelib/plugins/clonemanga.py b/dosagelib/plugins/clonemanga.py index 93b4f4107..a26310aec 100644 --- a/dosagelib/plugins/clonemanga.py +++ b/dosagelib/plugins/clonemanga.py @@ -3,7 +3,7 @@ # Copyright (C) 2012-2014 Bastian Kleineidam from re import compile from ..scraper import make_scraper -from ..util import tagre, getQueryParams, fetchUrl, getPageContent +from ..util import tagre, getQueryParams _linkTag = tagre("a", "href", r'([^"]+)') @@ -25,15 +25,15 @@ def add(name, shortName, imageFolder=None, lastStrip=None): @classmethod def _starter(cls): # first, try hopping to previous and next comic - data, _baseUrl = getPageContent(baseUrl, cls.session) + data = cls.getPage(baseUrl) try: - url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch) + url = cls.fetchUrl(baseUrl, data, _prevSearch) except ValueError: # no previous link found, try hopping to last comic - return fetchUrl(baseUrl, data, _baseUrl, _lastSearch) + return cls.fetchUrl(baseUrl, data, _lastSearch) else: - data, _baseUrl = getPageContent(url, cls.session) - return fetchUrl(url, data, _baseUrl, _nextSearch) + data = cls.getPage(url) + return cls.fetchUrl(url, data, _nextSearch) attrs = dict( name='CloneManga/' + name, diff --git a/dosagelib/plugins/drunkduck.py b/dosagelib/plugins/drunkduck.py index 8a48888f6..a0c385335 100644 --- a/dosagelib/plugins/drunkduck.py +++ b/dosagelib/plugins/drunkduck.py @@ -4,7 +4,7 @@ from re import compile from ..scraper import make_scraper, Genre -from ..util import tagre, fetchUrl, getPageContent +from ..util import tagre # note: adding the compile() functions inside add() is a major performance hog _imageSearch = compile(tagre("img", "src", r'(https://s3\.amazonaws\.com/media\.drunkduck\.com/[^"]+)', before="page-image")) @@ -27,15 +27,15 @@ def add(name, path): @classmethod def _starter(cls): # first, try hopping to previous and next comic - data, baseUrl = getPageContent(_url, cls.session) + data = cls.getPage(_url) try: - url = fetchUrl(_url, data, baseUrl, _prevSearch) + url = cls.fetchUrl(_url, data, _prevSearch) except ValueError: # no previous link found, try hopping to last comic - return fetchUrl(_url, data, baseUrl, _lastSearch) + return cls.fetchUrl(_url, data, _lastSearch) else: - data, baseUrl = getPageContent(url, cls.session) - return fetchUrl(url, data, baseUrl, _nextSearch) + data = cls.getPage(url) + return cls.fetchUrl(url, data, _nextSearch) attrs = dict( name = 'DrunkDuck/' + name, diff --git a/dosagelib/plugins/h.py b/dosagelib/plugins/h.py index dd52c8362..daeaea9d8 100644 --- a/dosagelib/plugins/h.py +++ b/dosagelib/plugins/h.py @@ -3,7 +3,7 @@ from re import compile, escape from ..scraper import _BasicScraper -from ..util import tagre, getPageContent, fetchUrls +from ..util import tagre from ..helpers import bounceStarter @@ -21,9 +21,9 @@ class HagarTheHorrible(_BasicScraper): def starter(cls): """Return last gallery link.""" url = 'http://www.hagardunor.net/comics.php' - content = getPageContent(url, cls.session)[0] + data = cls.getPage(url) pattern = compile(tagre("a", "href", cls.prevUrl)) - for starturl in fetchUrls(url, content, url, pattern): + for starturl in cls.fetchUrls(url, data, pattern): pass return starturl diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py index 794299975..7fa3420a5 100755 --- a/dosagelib/plugins/p.py +++ b/dosagelib/plugins/p.py @@ -5,7 +5,7 @@ from re import compile, escape from ..scraper import _BasicScraper from ..helpers import bounceStarter, queryNamer, indirectStarter -from ..util import tagre, fetchUrl, getPageContent +from ..util import tagre class PandyLand(_BasicScraper): @@ -104,10 +104,10 @@ class PennyArcade(_BasicScraper): @classmethod def starter(cls): """Get bounced start URL.""" - data, baseUrl = getPageContent(cls.url, cls.session) - url1 = fetchUrl(cls.url, data, baseUrl, cls.prevSearch) - data, baseUrl = getPageContent(url1, cls.session) - url2 = fetchUrl(url1, data, baseUrl, cls.nextSearch) + data = cls.getPage(cls.url) + url1 = cls.fetchUrl(cls.url, data, cls.prevSearch) + data = cls.getPage(url1) + url2 = cls.fetchUrl(url1, data, cls.nextSearch) return cls.prevUrlModifier(url2) @classmethod diff --git a/dosagelib/plugins/smackjeeves.py b/dosagelib/plugins/smackjeeves.py index 766e3ee2b..111f56e1d 100644 --- a/dosagelib/plugins/smackjeeves.py +++ b/dosagelib/plugins/smackjeeves.py @@ -3,7 +3,7 @@ # Copyright (C) 2012-2014 Bastian Kleineidam from re import compile from ..scraper import make_scraper -from ..util import tagre, quote, fetchUrl, case_insensitive_re, getPageContent +from ..util import tagre, quote, case_insensitive_re # SmackJeeves is a crawlers nightmare - users are allowed to edit HTML directly. # That's why there are so much different search patterns. @@ -45,11 +45,11 @@ def add(name, url, description, adult, bounce): def _starter(cls): """Get start URL.""" url1 = modifier(url) - data, baseUrl = getPageContent(url1, cls.session) - url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch) + data = cls.getPage(url1) + url2 = cls.fetchUrl(url1, data, cls.prevSearch) if bounce: - data, baseUrl = getPageContent(url2, cls.session) - url3 = fetchUrl(url2, data, baseUrl, _nextSearch) + data = cls.getPage(url2) + url3 = cls.fetchUrl(url2, data, _nextSearch) return modifier(url3) return modifier(url2) diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py index e43ab48ff..97403153d 100755 --- a/dosagelib/plugins/t.py +++ b/dosagelib/plugins/t.py @@ -5,7 +5,7 @@ from re import compile, escape, IGNORECASE from ..scraper import _BasicScraper from ..helpers import indirectStarter -from ..util import tagre, fetchUrl, getPageContent +from ..util import tagre class TheBrads(_BasicScraper): @@ -223,11 +223,11 @@ class TheThinHLine(_BasicScraper): indirectImageSearch = compile(tagre('a', 'href', r'(%simage/\d+)' % rurl)) - def getComicStrip(self, url, data, baseUrl): + def getComicStrip(self, url, data): """The comic strip image is in a separate page.""" - pageUrl = fetchUrl(url, data, baseUrl, self.indirectImageSearch) - pageData, pageBaseUrl = getPageContent(pageUrl, self.session) - return super(TheThinHLine, self).getComicStrip(pageUrl, pageData, pageBaseUrl) + pageUrl = self.fetchUrl(url, data, self.indirectImageSearch) + pageData = self.getPage(pageUrl) + return super(TheThinHLine, self).getComicStrip(pageUrl, pageData) @classmethod def namer(cls, imageUrl, pageUrl):