Unify similar code in fetchUrls and fetchText

This commit is contained in:
Tobias Gruetzmacher 2016-04-22 00:42:46 +02:00
parent 6574997e01
commit fd85c8583a

View file

@ -445,55 +445,59 @@ class _ParserScraper(Scraper):
def fetchUrls(self, url, data, urlSearch):
    """Search all entries for given XPath in a HTML page.

    Every match produced by ``self._matchPattern(data, urlSearch)`` is
    turned into a URL string: for a match that exposes ``attrib``, the
    value of a link attribute (one of ``html_link_attrs``) is used; a
    match without ``attrib`` is converted with ``str()``.

    :param url: address of the page; only used in the error message.
    :param data: parsed page handed on to ``self._matchPattern``.
    :param urlSearch: a single search pattern or a sequence of patterns.
    :returns: non-empty list of the URLs found.
    :raises ValueError: if no pattern yielded a URL.
    """
    searchUrls = []
    for match, search in self._matchPattern(data, urlSearch):
        searchUrl = None
        try:
            for attrib in html_link_attrs:
                if attrib in match.attrib:
                    searchUrl = match.get(attrib)
        except AttributeError:
            # The match has no ``attrib`` mapping (presumably a plain
            # string result rather than an element), so use its text.
            searchUrl = str(match)
        if searchUrl is not None:
            # Only report real hits; matches without any link attribute
            # are skipped and must not be logged as "Matched URL None".
            out.debug(u'Matched URL %r with pattern %s' % (searchUrl, search))
            searchUrls.append(searchUrl)

    if not searchUrls:
        raise ValueError("XPath %s not found at URL %s." %
                         (urlSearch, url))
    return searchUrls
def fetchText(self, url, data, textSearch, optional):
    """Search text entry for given text XPath in a HTML page.

    The text of every match produced by
    ``self._matchPattern(data, textSearch)`` is collected, joined with
    single spaces and stripped of surrounding whitespace.

    :param url: address of the page; only used in the error message.
    :param data: parsed page handed on to ``self._matchPattern``.
    :param textSearch: a single search pattern or a sequence of patterns;
        a false value disables the search.
    :param optional: if true, missing text yields ``None`` instead of an
        error.
    :returns: the stripped text, or ``None`` when ``textSearch`` is empty
        or nothing was found and ``optional`` is true.
    :raises ValueError: if nothing was found and ``optional`` is false.
    """
    if not textSearch:
        return None

    fragments = []
    for match, search in self._matchPattern(data, textSearch):
        try:
            fragment = match.text_content()
        except AttributeError:
            # No ``text_content`` method: the match is used as-is
            # (presumably it already is a string result).
            fragment = match
        # Log the fragment just matched, not the whole accumulated list.
        out.debug(u'Matched text %r with XPath %s' % (fragment, search))
        fragments.append(fragment)

    text = u' '.join(fragments).strip()
    if text == '':
        if optional:
            return None
        raise ValueError("XPath %s did not match anything at URL %s." %
                         (textSearch, url))
    return text
def _matchPattern(self, data, patterns):
    """Yield ``(match, pattern)`` pairs for the given search patterns.

    ``patterns`` is normalised with ``makeSequence`` and each pattern is
    evaluated against ``data``: through ``data.cssselect`` when
    ``self.css`` is true, otherwise through ``data.xpath`` with the
    namespaces in ``self.NS``. Unless ``self.multipleImagesPerStrip`` is
    true, the search ends after the first pattern that produced at least
    one match.
    """
    if self.css:
        select = data.cssselect
    else:
        def select(expression):
            return data.xpath(expression, namespaces=self.NS)

    for pattern in makeSequence(patterns):
        found = False
        for hit in select(pattern):
            found = True
            yield hit, pattern
        # do not search other links if one pattern matched
        if found and not self.multipleImagesPerStrip:
            return
def getDisabledReasons(self): def getDisabledReasons(self):
res = {} res = {}