Refactor: All the other class methods.

Turns out, it would have been better if all methods had been instance
methods and not class methods. This finishes a big chunk of the rework
needed for #42.
This commit is contained in:
Tobias Gruetzmacher 2016-04-21 23:52:31 +02:00
parent 0d436b8ca9
commit 6574997e01
4 changed files with 29 additions and 43 deletions

View file

@ -20,9 +20,8 @@ class EarthsongSaga(_ParserScraper):
prevSearch = '//a[@title="Previous"]' prevSearch = '//a[@title="Previous"]'
latestSearch = '//div[@id="leftmenu"]/span[1]/a[1]' latestSearch = '//div[@id="leftmenu"]/span[1]/a[1]'
@classmethod def fetchUrls(self, url, data, urlSearch):
def fetchUrls(cls, url, data, urlSearch): urls = super(EarthsongSaga, self).fetchUrls(url, data, urlSearch)
urls = super(EarthsongSaga, cls).fetchUrls(url, data, urlSearch)
return [x.replace('earthsongsaga.com/../', return [x.replace('earthsongsaga.com/../',
'earthsongsaga.com/') for x in urls] 'earthsongsaga.com/') for x in urls]

View file

@ -150,10 +150,9 @@ class ScurryAndCover(_ParserScraper):
nextSearch = '//div[@id="nextpage"]/..' nextSearch = '//div[@id="nextpage"]/..'
imageSearch = 'MARKER' imageSearch = 'MARKER'
@classmethod def fetchUrls(self, url, data, urlsearch):
def fetchUrls(cls, url, data, urlSearch): if urlsearch != self.imageSearch:
if urlSearch != cls.imageSearch: return super(ScurryAndCover, self).fetchUrls(url, data, urlsearch)
return super(ScurryAndCover, cls).fetchUrls(url, data, urlSearch)
# get javascript element and parse a variable value # get javascript element and parse a variable value
scripts = data.xpath('//body/script[@type="text/javascript"]') scripts = data.xpath('//body/script[@type="text/javascript"]')
@ -163,7 +162,7 @@ class ScurryAndCover(_ParserScraper):
images = regex.findall(script.text) images = regex.findall(script.text)
if len(images) > 0: if len(images) > 0:
image = images[0] image = images[0]
return [cls.url + '/images/pages/' + image + '-xsmall.png'] return [self.url + '/images/pages/' + image + '-xsmall.png']
def starter(self): def starter(self):
"""Go forward as far as possibe, then start.""" """Go forward as far as possibe, then start."""

View file

@ -56,10 +56,8 @@ class Stellar(_WLPComics):
url = 'http://www.wlpcomics.com/adult/stellar/' url = 'http://www.wlpcomics.com/adult/stellar/'
adult = True adult = True
@classmethod def prevUrlModifier(self, prev_url):
def fetchUrls(cls, url, data, urlSearch):
"""Bugfix for empty page...""" """Bugfix for empty page..."""
urls = super(Stellar, cls).fetchUrls(url, data, urlSearch) if prev_url == self.url + '075.html':
if cls.url + '075.html' in urls: return self.url + '074.html'
urls = [cls.url + '074.html'] return prev_url
return urls

View file

@ -280,8 +280,7 @@ class Scraper(object):
with open(filename, 'w') as f: with open(filename, 'w') as f:
f.write('All comics should be downloaded here.') f.write('All comics should be downloaded here.')
@classmethod def getPage(self, url):
def getPage(cls, url):
""" """
Fetch a page and return the opaque repesentation for the data parameter Fetch a page and return the opaque repesentation for the data parameter
of fetchUrls and fetchText. of fetchUrls and fetchText.
@ -295,16 +294,13 @@ class Scraper(object):
""" """
raise ValueError("No implementation for getPage!") raise ValueError("No implementation for getPage!")
@classmethod def fetchUrls(self, url, data, urlsearch):
def fetchUrls(cls, url, data, urlSearch):
raise ValueError("No implementation for fetchUrls!") raise ValueError("No implementation for fetchUrls!")
@classmethod def fetchUrl(self, url, data, urlsearch):
def fetchUrl(cls, url, data, urlSearch): return self.fetchUrls(url, data, urlsearch)[0]
return cls.fetchUrls(url, data, urlSearch)[0]
@classmethod def fetchText(self, url, data, textsearch, optional):
def fetchText(cls, url, data, textSearch, optional):
raise ValueError("No implementation for fetchText!") raise ValueError("No implementation for fetchText!")
def getDisabledReasons(self): def getDisabledReasons(self):
@ -351,20 +347,18 @@ class _BasicScraper(Scraper):
BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)')) BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
@classmethod def getPage(self, url):
def getPage(cls, url): content = get_page(url, self.session).text
content = get_page(url, cls.session).text
# determine base URL # determine base URL
baseUrl = None baseUrl = None
match = cls.BASE_SEARCH.search(content) match = self.BASE_SEARCH.search(content)
if match: if match:
baseUrl = match.group(1) baseUrl = match.group(1)
else: else:
baseUrl = url baseUrl = url
return (content, baseUrl) return (content, baseUrl)
@classmethod def fetchUrls(self, url, data, urlSearch):
def fetchUrls(cls, url, data, urlSearch):
"""Search all entries for given URL pattern(s) in a HTML page.""" """Search all entries for given URL pattern(s) in a HTML page."""
searchUrls = [] searchUrls = []
searches = makeSequence(urlSearch) searches = makeSequence(urlSearch)
@ -386,8 +380,7 @@ class _BasicScraper(Scraper):
(patterns, url)) (patterns, url))
return searchUrls return searchUrls
@classmethod def fetchText(self, url, data, textSearch, optional):
def fetchText(cls, url, data, textSearch, optional):
"""Search text entry for given text pattern in a HTML page.""" """Search text entry for given text pattern in a HTML page."""
if textSearch: if textSearch:
match = textSearch.search(data[0]) match = textSearch.search(data[0])
@ -434,31 +427,29 @@ class _ParserScraper(Scraper):
# another Python module, XPath is the default for now. # another Python module, XPath is the default for now.
css = False css = False
@classmethod def getPage(self, url):
def getPage(cls, url): page = get_page(url, self.session)
page = get_page(url, cls.session)
if page.encoding: if page.encoding:
# Requests figured out the encoding, so we can deliver Unicode to # Requests figured out the encoding, so we can deliver Unicode to
# LXML. Unfortunatly, LXML feels betrayed if there is still an XML # LXML. Unfortunatly, LXML feels betrayed if there is still an XML
# declaration with (probably wrong!) encoding at the top of the # declaration with (probably wrong!) encoding at the top of the
# document. Web browsers ignore such if the encoding was specified # document. Web browsers ignore such if the encoding was specified
# in the HTTP header and so do we. # in the HTTP header and so do we.
text = cls.XML_DECL.sub('\1\2', page.text, count=1) text = self.XML_DECL.sub('\1\2', page.text, count=1)
tree = html.document_fromstring(text) tree = html.document_fromstring(text)
else: else:
tree = html.document_fromstring(page.content) tree = html.document_fromstring(page.content)
tree.make_links_absolute(url) tree.make_links_absolute(url)
return tree return tree
@classmethod def fetchUrls(self, url, data, urlSearch):
def fetchUrls(cls, url, data, urlSearch):
"""Search all entries for given XPath in a HTML page.""" """Search all entries for given XPath in a HTML page."""
searchUrls = [] searchUrls = []
if cls.css: if self.css:
searchFun = data.cssselect searchFun = data.cssselect
else: else:
def searchFun(s): def searchFun(s):
return data.xpath(s, namespaces=cls.NS) return data.xpath(s, namespaces=self.NS)
searches = makeSequence(urlSearch) searches = makeSequence(urlSearch)
for search in searches: for search in searches:
for match in searchFun(search): for match in searchFun(search):
@ -472,17 +463,16 @@ class _ParserScraper(Scraper):
(searchUrl, search)) (searchUrl, search))
searchUrls.append(searchUrl) searchUrls.append(searchUrl)
if not cls.multipleImagesPerStrip and searchUrls: if not self.multipleImagesPerStrip and searchUrls:
# do not search other links if one pattern matched # do not search other links if one pattern matched
break break
if not searchUrls: if not searchUrls:
raise ValueError("XPath %s not found at URL %s." % (searches, url)) raise ValueError("XPath %s not found at URL %s." % (searches, url))
return searchUrls return searchUrls
@classmethod def fetchText(self, url, data, textSearch, optional):
def fetchText(cls, url, data, textSearch, optional):
"""Search text entry for given text XPath in a HTML page.""" """Search text entry for given text XPath in a HTML page."""
if cls.css: if self.css:
searchFun = data.cssselect searchFun = data.cssselect
else: else:
searchFun = data.xpath searchFun = data.xpath