Refactor: All the other class methods.

Turns out, it would have been better if all methods had been instance
methods and not class methods. This finished a big chunk of the rework
needed for #42.
This commit is contained in:
Tobias Gruetzmacher 2016-04-21 23:52:31 +02:00
parent 0d436b8ca9
commit 6574997e01
4 changed files with 29 additions and 43 deletions

View file

@@ -20,9 +20,8 @@ class EarthsongSaga(_ParserScraper):
prevSearch = '//a[@title="Previous"]'
latestSearch = '//div[@id="leftmenu"]/span[1]/a[1]'
@classmethod
def fetchUrls(cls, url, data, urlSearch):
urls = super(EarthsongSaga, cls).fetchUrls(url, data, urlSearch)
def fetchUrls(self, url, data, urlSearch):
urls = super(EarthsongSaga, self).fetchUrls(url, data, urlSearch)
return [x.replace('earthsongsaga.com/../',
'earthsongsaga.com/') for x in urls]

View file

@@ -150,10 +150,9 @@ class ScurryAndCover(_ParserScraper):
nextSearch = '//div[@id="nextpage"]/..'
imageSearch = 'MARKER'
@classmethod
def fetchUrls(cls, url, data, urlSearch):
if urlSearch != cls.imageSearch:
return super(ScurryAndCover, cls).fetchUrls(url, data, urlSearch)
def fetchUrls(self, url, data, urlsearch):
if urlsearch != self.imageSearch:
return super(ScurryAndCover, self).fetchUrls(url, data, urlsearch)
# get javascript element and parse a variable value
scripts = data.xpath('//body/script[@type="text/javascript"]')
@@ -163,7 +162,7 @@ class ScurryAndCover(_ParserScraper):
images = regex.findall(script.text)
if len(images) > 0:
image = images[0]
return [cls.url + '/images/pages/' + image + '-xsmall.png']
return [self.url + '/images/pages/' + image + '-xsmall.png']
def starter(self):
"""Go forward as far as possible, then start."""

View file

@@ -56,10 +56,8 @@ class Stellar(_WLPComics):
url = 'http://www.wlpcomics.com/adult/stellar/'
adult = True
@classmethod
def fetchUrls(cls, url, data, urlSearch):
def prevUrlModifier(self, prev_url):
"""Bugfix for empty page..."""
urls = super(Stellar, cls).fetchUrls(url, data, urlSearch)
if cls.url + '075.html' in urls:
urls = [cls.url + '074.html']
return urls
if prev_url == self.url + '075.html':
return self.url + '074.html'
return prev_url

View file

@@ -280,8 +280,7 @@ class Scraper(object):
with open(filename, 'w') as f:
f.write('All comics should be downloaded here.')
@classmethod
def getPage(cls, url):
def getPage(self, url):
"""
Fetch a page and return the opaque representation for the data parameter
of fetchUrls and fetchText.
@@ -295,16 +294,13 @@ class Scraper(object):
"""
raise ValueError("No implementation for getPage!")
@classmethod
def fetchUrls(cls, url, data, urlSearch):
def fetchUrls(self, url, data, urlsearch):
raise ValueError("No implementation for fetchUrls!")
@classmethod
def fetchUrl(cls, url, data, urlSearch):
return cls.fetchUrls(url, data, urlSearch)[0]
def fetchUrl(self, url, data, urlsearch):
return self.fetchUrls(url, data, urlsearch)[0]
@classmethod
def fetchText(cls, url, data, textSearch, optional):
def fetchText(self, url, data, textsearch, optional):
raise ValueError("No implementation for fetchText!")
def getDisabledReasons(self):
@@ -351,20 +347,18 @@ class _BasicScraper(Scraper):
BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
@classmethod
def getPage(cls, url):
content = get_page(url, cls.session).text
def getPage(self, url):
content = get_page(url, self.session).text
# determine base URL
baseUrl = None
match = cls.BASE_SEARCH.search(content)
match = self.BASE_SEARCH.search(content)
if match:
baseUrl = match.group(1)
else:
baseUrl = url
return (content, baseUrl)
@classmethod
def fetchUrls(cls, url, data, urlSearch):
def fetchUrls(self, url, data, urlSearch):
"""Search all entries for given URL pattern(s) in a HTML page."""
searchUrls = []
searches = makeSequence(urlSearch)
@@ -386,8 +380,7 @@ class _BasicScraper(Scraper):
(patterns, url))
return searchUrls
@classmethod
def fetchText(cls, url, data, textSearch, optional):
def fetchText(self, url, data, textSearch, optional):
"""Search text entry for given text pattern in a HTML page."""
if textSearch:
match = textSearch.search(data[0])
@@ -434,31 +427,29 @@ class _ParserScraper(Scraper):
# another Python module, XPath is the default for now.
css = False
@classmethod
def getPage(cls, url):
page = get_page(url, cls.session)
def getPage(self, url):
page = get_page(url, self.session)
if page.encoding:
# Requests figured out the encoding, so we can deliver Unicode to
# LXML. Unfortunately, LXML feels betrayed if there is still an XML
# declaration with (probably wrong!) encoding at the top of the
# document. Web browsers ignore such if the encoding was specified
# in the HTTP header and so do we.
text = cls.XML_DECL.sub('\1\2', page.text, count=1)
text = self.XML_DECL.sub('\1\2', page.text, count=1)
tree = html.document_fromstring(text)
else:
tree = html.document_fromstring(page.content)
tree.make_links_absolute(url)
return tree
@classmethod
def fetchUrls(cls, url, data, urlSearch):
def fetchUrls(self, url, data, urlSearch):
"""Search all entries for given XPath in a HTML page."""
searchUrls = []
if cls.css:
if self.css:
searchFun = data.cssselect
else:
def searchFun(s):
return data.xpath(s, namespaces=cls.NS)
return data.xpath(s, namespaces=self.NS)
searches = makeSequence(urlSearch)
for search in searches:
for match in searchFun(search):
@@ -472,17 +463,16 @@ class _ParserScraper(Scraper):
(searchUrl, search))
searchUrls.append(searchUrl)
if not cls.multipleImagesPerStrip and searchUrls:
if not self.multipleImagesPerStrip and searchUrls:
# do not search other links if one pattern matched
break
if not searchUrls:
raise ValueError("XPath %s not found at URL %s." % (searches, url))
return searchUrls
@classmethod
def fetchText(cls, url, data, textSearch, optional):
def fetchText(self, url, data, textSearch, optional):
"""Search text entry for given text XPath in a HTML page."""
if cls.css:
if self.css:
searchFun = data.cssselect
else:
searchFun = data.xpath