Unify similar code in fetchUrls and fetchText
This commit is contained in:
parent
6574997e01
commit
fd85c8583a
1 changed file with 44 additions and 40 deletions
|
@@ -445,55 +445,59 @@ class _ParserScraper(Scraper):
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def fetchUrls(self, url, data, urlSearch):
|
||||||
"""Search all entries for given XPath in a HTML page."""
|
"""Search all entries for given XPath in a HTML page."""
|
||||||
searchUrls = []
|
searchUrls = []
|
||||||
|
for match, search in self._matchPattern(data, urlSearch):
|
||||||
|
searchUrl = None
|
||||||
|
try:
|
||||||
|
for attrib in html_link_attrs:
|
||||||
|
if attrib in match.attrib:
|
||||||
|
searchUrl = match.get(attrib)
|
||||||
|
except AttributeError:
|
||||||
|
searchUrl = str(match)
|
||||||
|
out.debug(u'Matched URL %r with pattern %s' % (searchUrl, search))
|
||||||
|
if searchUrl is not None:
|
||||||
|
searchUrls.append(searchUrl)
|
||||||
|
|
||||||
|
if not searchUrls:
|
||||||
|
raise ValueError("XPath %s not found at URL %s." %
|
||||||
|
(urlSearch, url))
|
||||||
|
return searchUrls
|
||||||
|
|
||||||
|
def fetchText(self, url, data, textSearch, optional):
|
||||||
|
"""Search text entry for given text XPath in a HTML page."""
|
||||||
|
if not textSearch:
|
||||||
|
return None
|
||||||
|
text = []
|
||||||
|
for match, search in self._matchPattern(data, textSearch):
|
||||||
|
try:
|
||||||
|
text.append(match.text_content())
|
||||||
|
except AttributeError:
|
||||||
|
text.append(match)
|
||||||
|
out.debug(u'Matched text %r with XPath %s' % (text, search))
|
||||||
|
text = u' '.join(text)
|
||||||
|
if text.strip() == '':
|
||||||
|
if optional:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
raise ValueError("XPath %s did not match anything at URL %s." %
|
||||||
|
(textSearch, url))
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
|
def _matchPattern(self, data, patterns):
|
||||||
if self.css:
|
if self.css:
|
||||||
searchFun = data.cssselect
|
searchFun = data.cssselect
|
||||||
else:
|
else:
|
||||||
def searchFun(s):
|
def searchFun(s):
|
||||||
return data.xpath(s, namespaces=self.NS)
|
return data.xpath(s, namespaces=self.NS)
|
||||||
searches = makeSequence(urlSearch)
|
patterns = makeSequence(patterns)
|
||||||
for search in searches:
|
for search in patterns:
|
||||||
|
matched = False
|
||||||
for match in searchFun(search):
|
for match in searchFun(search):
|
||||||
try:
|
matched = True
|
||||||
for attrib in html_link_attrs:
|
yield match, search
|
||||||
if attrib in match.attrib:
|
|
||||||
searchUrl = match.get(attrib)
|
|
||||||
except AttributeError:
|
|
||||||
searchUrl = str(match)
|
|
||||||
out.debug(u'Matched URL %r with pattern %s' %
|
|
||||||
(searchUrl, search))
|
|
||||||
searchUrls.append(searchUrl)
|
|
||||||
|
|
||||||
if not self.multipleImagesPerStrip and searchUrls:
|
if matched and not self.multipleImagesPerStrip:
|
||||||
# do not search other links if one pattern matched
|
# do not search other links if one pattern matched
|
||||||
break
|
break
|
||||||
if not searchUrls:
|
|
||||||
raise ValueError("XPath %s not found at URL %s." % (searches, url))
|
|
||||||
return searchUrls
|
|
||||||
|
|
||||||
def fetchText(self, url, data, textSearch, optional):
|
|
||||||
"""Search text entry for given text XPath in a HTML page."""
|
|
||||||
if self.css:
|
|
||||||
searchFun = data.cssselect
|
|
||||||
else:
|
|
||||||
searchFun = data.xpath
|
|
||||||
if textSearch:
|
|
||||||
text = ''
|
|
||||||
for match in searchFun(textSearch):
|
|
||||||
try:
|
|
||||||
text += u' ' + match.text_content()
|
|
||||||
except AttributeError:
|
|
||||||
text += u' ' + match
|
|
||||||
if text.strip() == '':
|
|
||||||
if optional:
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"XPath %s did not match anything at URL %s." %
|
|
||||||
(textSearch, url))
|
|
||||||
out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
|
|
||||||
return unescape(text).strip()
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def getDisabledReasons(self):
|
def getDisabledReasons(self):
|
||||||
res = {}
|
res = {}
|
||||||
|
|
Loading…
Reference in a new issue