Unify similar code in fetchUrls and fetchText

2016-04-22 00:42:46 +02:00 · 2016-04-22 00:42:46 +02:00 · fd85c8583a
commit fd85c8583a
parent 6574997e01
1 changed file with 44 additions and 40 deletions
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -445,55 +445,59 @@ class _ParserScraper(Scraper):
    def fetchUrls(self, url, data, urlSearch):
        """Search all entries for given XPath in a HTML page."""
        searchUrls = []
-        if self.css:
+        for match, search in self._matchPattern(data, urlSearch):
-            searchFun = data.cssselect
+            searchUrl = None
        else:
            def searchFun(s):
                return data.xpath(s, namespaces=self.NS)
        searches = makeSequence(urlSearch)
        for search in searches:
            for match in searchFun(search):
            try:
                for attrib in html_link_attrs:
                    if attrib in match.attrib:
                        searchUrl = match.get(attrib)
            except AttributeError:
                searchUrl = str(match)
-                out.debug(u'Matched URL %r with pattern %s' %
+            out.debug(u'Matched URL %r with pattern %s' % (searchUrl, search))
-                          (searchUrl, search))
+            if searchUrl is not None:
                searchUrls.append(searchUrl)
            if not self.multipleImagesPerStrip and searchUrls:
                # do not search other links if one pattern matched
                break
        if not searchUrls:
-            raise ValueError("XPath %s not found at URL %s." % (searches, url))
+            raise ValueError("XPath %s not found at URL %s." %
                             (urlSearch, url))
        return searchUrls
    def fetchText(self, url, data, textSearch, optional):
        """Search text entry for given text XPath in a HTML page."""
-        if self.css:
+        if not textSearch:
-            searchFun = data.cssselect
+            return None
-        else:
+        text = []
-            searchFun = data.xpath
+        for match, search in self._matchPattern(data, textSearch):
        if textSearch:
            text = ''
            for match in searchFun(textSearch):
            try:
-                    text += u' ' + match.text_content()
+                text.append(match.text_content())
            except AttributeError:
-                    text += u' ' + match
+                text.append(match)
            out.debug(u'Matched text %r with XPath %s' % (text, search))
        text = u' '.join(text)
        if text.strip() == '':
            if optional:
                return None
            else:
-                    raise ValueError(
+                raise ValueError("XPath %s did not match anything at URL %s." %
                        "XPath %s did not match anything at URL %s." %
                                 (textSearch, url))
-            out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
+        return text.strip()
-            return unescape(text).strip()
+
    def _matchPattern(self, data, patterns):
        if self.css:
            searchFun = data.cssselect
        else:
-            return None
+            def searchFun(s):
                return data.xpath(s, namespaces=self.NS)
        patterns = makeSequence(patterns)
        for search in patterns:
            matched = False
            for match in searchFun(search):
                matched = True
                yield match, search
            if matched and not self.multipleImagesPerStrip:
                # do not search other links if one pattern matched
                break
    def getDisabledReasons(self):
        res = {}