From fd85c8583aaaea300d6383b76d65f5bc9ffd0643 Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher <tobias-git@23.gs>
Date: Fri, 22 Apr 2016 00:42:46 +0200
Subject: [PATCH] Unify similar code in fetchUrl and fetchText

---
 dosagelib/scraper.py | 84 +++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 40 deletions(-)

diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index 54778bb3f..46df0350d 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -445,55 +445,59 @@ class _ParserScraper(Scraper):
     def fetchUrls(self, url, data, urlSearch):
         """Search all entries for given XPath in a HTML page."""
         searchUrls = []
+        for match, search in self._matchPattern(data, urlSearch):
+            searchUrl = None
+            try:
+                for attrib in html_link_attrs:
+                    if attrib in match.attrib:
+                        searchUrl = match.get(attrib)
+            except AttributeError:
+                searchUrl = str(match)
+            out.debug(u'Matched URL %r with pattern %s' % (searchUrl, search))
+            if searchUrl is not None:
+                searchUrls.append(searchUrl)
+
+        if not searchUrls:
+            raise ValueError("XPath %s not found at URL %s." %
+                             (urlSearch, url))
+        return searchUrls
+
+    def fetchText(self, url, data, textSearch, optional):
+        """Search text entry for given text XPath in a HTML page."""
+        if not textSearch:
+            return None
+        text = []
+        for match, search in self._matchPattern(data, textSearch):
+            try:
+                text.append(match.text_content())
+            except AttributeError:
+                text.append(match)
+            out.debug(u'Matched text %r with XPath %s' % (text, search))
+        text = u' '.join(text)
+        if text.strip() == '':
+            if optional:
+                return None
+            else:
+                raise ValueError("XPath %s did not match anything at URL %s." %
+                                 (textSearch, url))
+        return text.strip()
+
+    def _matchPattern(self, data, patterns):
         if self.css:
             searchFun = data.cssselect
         else:
             def searchFun(s):
                 return data.xpath(s, namespaces=self.NS)
-        searches = makeSequence(urlSearch)
-        for search in searches:
+        patterns = makeSequence(patterns)
+        for search in patterns:
+            matched = False
             for match in searchFun(search):
-                try:
-                    for attrib in html_link_attrs:
-                        if attrib in match.attrib:
-                            searchUrl = match.get(attrib)
-                except AttributeError:
-                    searchUrl = str(match)
-                out.debug(u'Matched URL %r with pattern %s' %
-                          (searchUrl, search))
-                searchUrls.append(searchUrl)
+                matched = True
+                yield match, search
 
-            if not self.multipleImagesPerStrip and searchUrls:
+            if matched and not self.multipleImagesPerStrip:
                 # do not search other links if one pattern matched
                 break
-        if not searchUrls:
-            raise ValueError("XPath %s not found at URL %s." % (searches, url))
-        return searchUrls
-
-    def fetchText(self, url, data, textSearch, optional):
-        """Search text entry for given text XPath in a HTML page."""
-        if self.css:
-            searchFun = data.cssselect
-        else:
-            searchFun = data.xpath
-        if textSearch:
-            text = ''
-            for match in searchFun(textSearch):
-                try:
-                    text += u' ' + match.text_content()
-                except AttributeError:
-                    text += u' ' + match
-            if text.strip() == '':
-                if optional:
-                    return None
-                else:
-                    raise ValueError(
-                        "XPath %s did not match anything at URL %s." %
-                        (textSearch, url))
-            out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
-            return unescape(text).strip()
-        else:
-            return None
 
     def getDisabledReasons(self):
         res = {}