Refactor: All the other class methods.
It turns out it would have been better if all of these methods had been instance methods rather than class methods. This finishes a big chunk of the rework needed for #42.
This commit is contained in:
parent
0d436b8ca9
commit
6574997e01
4 changed files with 29 additions and 43 deletions
|
@ -20,9 +20,8 @@ class EarthsongSaga(_ParserScraper):
|
||||||
prevSearch = '//a[@title="Previous"]'
|
prevSearch = '//a[@title="Previous"]'
|
||||||
latestSearch = '//div[@id="leftmenu"]/span[1]/a[1]'
|
latestSearch = '//div[@id="leftmenu"]/span[1]/a[1]'
|
||||||
|
|
||||||
@classmethod
|
def fetchUrls(self, url, data, urlSearch):
|
||||||
def fetchUrls(cls, url, data, urlSearch):
|
urls = super(EarthsongSaga, self).fetchUrls(url, data, urlSearch)
|
||||||
urls = super(EarthsongSaga, cls).fetchUrls(url, data, urlSearch)
|
|
||||||
return [x.replace('earthsongsaga.com/../',
|
return [x.replace('earthsongsaga.com/../',
|
||||||
'earthsongsaga.com/') for x in urls]
|
'earthsongsaga.com/') for x in urls]
|
||||||
|
|
||||||
|
|
|
@ -150,10 +150,9 @@ class ScurryAndCover(_ParserScraper):
|
||||||
nextSearch = '//div[@id="nextpage"]/..'
|
nextSearch = '//div[@id="nextpage"]/..'
|
||||||
imageSearch = 'MARKER'
|
imageSearch = 'MARKER'
|
||||||
|
|
||||||
@classmethod
|
def fetchUrls(self, url, data, urlsearch):
|
||||||
def fetchUrls(cls, url, data, urlSearch):
|
if urlsearch != self.imageSearch:
|
||||||
if urlSearch != cls.imageSearch:
|
return super(ScurryAndCover, self).fetchUrls(url, data, urlsearch)
|
||||||
return super(ScurryAndCover, cls).fetchUrls(url, data, urlSearch)
|
|
||||||
|
|
||||||
# get javascript element and parse a variable value
|
# get javascript element and parse a variable value
|
||||||
scripts = data.xpath('//body/script[@type="text/javascript"]')
|
scripts = data.xpath('//body/script[@type="text/javascript"]')
|
||||||
|
@ -163,7 +162,7 @@ class ScurryAndCover(_ParserScraper):
|
||||||
images = regex.findall(script.text)
|
images = regex.findall(script.text)
|
||||||
if len(images) > 0:
|
if len(images) > 0:
|
||||||
image = images[0]
|
image = images[0]
|
||||||
return [cls.url + '/images/pages/' + image + '-xsmall.png']
|
return [self.url + '/images/pages/' + image + '-xsmall.png']
|
||||||
|
|
||||||
def starter(self):
|
def starter(self):
|
||||||
"""Go forward as far as possibe, then start."""
|
"""Go forward as far as possibe, then start."""
|
||||||
|
|
|
@ -56,10 +56,8 @@ class Stellar(_WLPComics):
|
||||||
url = 'http://www.wlpcomics.com/adult/stellar/'
|
url = 'http://www.wlpcomics.com/adult/stellar/'
|
||||||
adult = True
|
adult = True
|
||||||
|
|
||||||
@classmethod
|
def prevUrlModifier(self, prev_url):
|
||||||
def fetchUrls(cls, url, data, urlSearch):
|
|
||||||
"""Bugfix for empty page..."""
|
"""Bugfix for empty page..."""
|
||||||
urls = super(Stellar, cls).fetchUrls(url, data, urlSearch)
|
if prev_url == self.url + '075.html':
|
||||||
if cls.url + '075.html' in urls:
|
return self.url + '074.html'
|
||||||
urls = [cls.url + '074.html']
|
return prev_url
|
||||||
return urls
|
|
||||||
|
|
|
@ -280,8 +280,7 @@ class Scraper(object):
|
||||||
with open(filename, 'w') as f:
|
with open(filename, 'w') as f:
|
||||||
f.write('All comics should be downloaded here.')
|
f.write('All comics should be downloaded here.')
|
||||||
|
|
||||||
@classmethod
|
def getPage(self, url):
|
||||||
def getPage(cls, url):
|
|
||||||
"""
|
"""
|
||||||
Fetch a page and return the opaque repesentation for the data parameter
|
Fetch a page and return the opaque repesentation for the data parameter
|
||||||
of fetchUrls and fetchText.
|
of fetchUrls and fetchText.
|
||||||
|
@ -295,16 +294,13 @@ class Scraper(object):
|
||||||
"""
|
"""
|
||||||
raise ValueError("No implementation for getPage!")
|
raise ValueError("No implementation for getPage!")
|
||||||
|
|
||||||
@classmethod
|
def fetchUrls(self, url, data, urlsearch):
|
||||||
def fetchUrls(cls, url, data, urlSearch):
|
|
||||||
raise ValueError("No implementation for fetchUrls!")
|
raise ValueError("No implementation for fetchUrls!")
|
||||||
|
|
||||||
@classmethod
|
def fetchUrl(self, url, data, urlsearch):
|
||||||
def fetchUrl(cls, url, data, urlSearch):
|
return self.fetchUrls(url, data, urlsearch)[0]
|
||||||
return cls.fetchUrls(url, data, urlSearch)[0]
|
|
||||||
|
|
||||||
@classmethod
|
def fetchText(self, url, data, textsearch, optional):
|
||||||
def fetchText(cls, url, data, textSearch, optional):
|
|
||||||
raise ValueError("No implementation for fetchText!")
|
raise ValueError("No implementation for fetchText!")
|
||||||
|
|
||||||
def getDisabledReasons(self):
|
def getDisabledReasons(self):
|
||||||
|
@ -351,20 +347,18 @@ class _BasicScraper(Scraper):
|
||||||
|
|
||||||
BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
|
BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
|
||||||
|
|
||||||
@classmethod
|
def getPage(self, url):
|
||||||
def getPage(cls, url):
|
content = get_page(url, self.session).text
|
||||||
content = get_page(url, cls.session).text
|
|
||||||
# determine base URL
|
# determine base URL
|
||||||
baseUrl = None
|
baseUrl = None
|
||||||
match = cls.BASE_SEARCH.search(content)
|
match = self.BASE_SEARCH.search(content)
|
||||||
if match:
|
if match:
|
||||||
baseUrl = match.group(1)
|
baseUrl = match.group(1)
|
||||||
else:
|
else:
|
||||||
baseUrl = url
|
baseUrl = url
|
||||||
return (content, baseUrl)
|
return (content, baseUrl)
|
||||||
|
|
||||||
@classmethod
|
def fetchUrls(self, url, data, urlSearch):
|
||||||
def fetchUrls(cls, url, data, urlSearch):
|
|
||||||
"""Search all entries for given URL pattern(s) in a HTML page."""
|
"""Search all entries for given URL pattern(s) in a HTML page."""
|
||||||
searchUrls = []
|
searchUrls = []
|
||||||
searches = makeSequence(urlSearch)
|
searches = makeSequence(urlSearch)
|
||||||
|
@ -386,8 +380,7 @@ class _BasicScraper(Scraper):
|
||||||
(patterns, url))
|
(patterns, url))
|
||||||
return searchUrls
|
return searchUrls
|
||||||
|
|
||||||
@classmethod
|
def fetchText(self, url, data, textSearch, optional):
|
||||||
def fetchText(cls, url, data, textSearch, optional):
|
|
||||||
"""Search text entry for given text pattern in a HTML page."""
|
"""Search text entry for given text pattern in a HTML page."""
|
||||||
if textSearch:
|
if textSearch:
|
||||||
match = textSearch.search(data[0])
|
match = textSearch.search(data[0])
|
||||||
|
@ -434,31 +427,29 @@ class _ParserScraper(Scraper):
|
||||||
# another Python module, XPath is the default for now.
|
# another Python module, XPath is the default for now.
|
||||||
css = False
|
css = False
|
||||||
|
|
||||||
@classmethod
|
def getPage(self, url):
|
||||||
def getPage(cls, url):
|
page = get_page(url, self.session)
|
||||||
page = get_page(url, cls.session)
|
|
||||||
if page.encoding:
|
if page.encoding:
|
||||||
# Requests figured out the encoding, so we can deliver Unicode to
|
# Requests figured out the encoding, so we can deliver Unicode to
|
||||||
# LXML. Unfortunatly, LXML feels betrayed if there is still an XML
|
# LXML. Unfortunatly, LXML feels betrayed if there is still an XML
|
||||||
# declaration with (probably wrong!) encoding at the top of the
|
# declaration with (probably wrong!) encoding at the top of the
|
||||||
# document. Web browsers ignore such if the encoding was specified
|
# document. Web browsers ignore such if the encoding was specified
|
||||||
# in the HTTP header and so do we.
|
# in the HTTP header and so do we.
|
||||||
text = cls.XML_DECL.sub('\1\2', page.text, count=1)
|
text = self.XML_DECL.sub('\1\2', page.text, count=1)
|
||||||
tree = html.document_fromstring(text)
|
tree = html.document_fromstring(text)
|
||||||
else:
|
else:
|
||||||
tree = html.document_fromstring(page.content)
|
tree = html.document_fromstring(page.content)
|
||||||
tree.make_links_absolute(url)
|
tree.make_links_absolute(url)
|
||||||
return tree
|
return tree
|
||||||
|
|
||||||
@classmethod
|
def fetchUrls(self, url, data, urlSearch):
|
||||||
def fetchUrls(cls, url, data, urlSearch):
|
|
||||||
"""Search all entries for given XPath in a HTML page."""
|
"""Search all entries for given XPath in a HTML page."""
|
||||||
searchUrls = []
|
searchUrls = []
|
||||||
if cls.css:
|
if self.css:
|
||||||
searchFun = data.cssselect
|
searchFun = data.cssselect
|
||||||
else:
|
else:
|
||||||
def searchFun(s):
|
def searchFun(s):
|
||||||
return data.xpath(s, namespaces=cls.NS)
|
return data.xpath(s, namespaces=self.NS)
|
||||||
searches = makeSequence(urlSearch)
|
searches = makeSequence(urlSearch)
|
||||||
for search in searches:
|
for search in searches:
|
||||||
for match in searchFun(search):
|
for match in searchFun(search):
|
||||||
|
@ -472,17 +463,16 @@ class _ParserScraper(Scraper):
|
||||||
(searchUrl, search))
|
(searchUrl, search))
|
||||||
searchUrls.append(searchUrl)
|
searchUrls.append(searchUrl)
|
||||||
|
|
||||||
if not cls.multipleImagesPerStrip and searchUrls:
|
if not self.multipleImagesPerStrip and searchUrls:
|
||||||
# do not search other links if one pattern matched
|
# do not search other links if one pattern matched
|
||||||
break
|
break
|
||||||
if not searchUrls:
|
if not searchUrls:
|
||||||
raise ValueError("XPath %s not found at URL %s." % (searches, url))
|
raise ValueError("XPath %s not found at URL %s." % (searches, url))
|
||||||
return searchUrls
|
return searchUrls
|
||||||
|
|
||||||
@classmethod
|
def fetchText(self, url, data, textSearch, optional):
|
||||||
def fetchText(cls, url, data, textSearch, optional):
|
|
||||||
"""Search text entry for given text XPath in a HTML page."""
|
"""Search text entry for given text XPath in a HTML page."""
|
||||||
if cls.css:
|
if self.css:
|
||||||
searchFun = data.cssselect
|
searchFun = data.cssselect
|
||||||
else:
|
else:
|
||||||
searchFun = data.xpath
|
searchFun = data.xpath
|
||||||
|
|
Loading…
Reference in a new issue