From f9f0b75d7cd31c3790c9d913d8b1529f2af82a62 Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher <tobias-git@23.gs>
Date: Wed, 23 Jul 2014 20:54:00 +0200
Subject: [PATCH] Create new HTML parser based scraper class.

---
 dosagelib/scraper.py | 79 ++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt     |  1 +
 2 files changed, 80 insertions(+)

diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index ef9541a4e..7a9bb9972 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -372,6 +372,85 @@ class _BasicScraper(Scraper):
             return None
 
 
+class _ParserScraper(Scraper):
+    """
+    Scraper base class that uses a HTML parser and XPath expressions.
+
+    All links are resolved before XPath searches are applied, so all URLs are
+    absolute!
+
+    Subclasses of this class should use XPath expressions as values for
+    prevSearch, imageSearch and textSearch. When the XPath directly selects an
+    attribute, it is used as the output.
+
+    All those searches try to do something intelligent when they match a
+    complete HTML Element: prevSearch and imageSearch try to find a "link
+    attribute" and use that as URL. textSearch strips all tags from the content
+    of the HTML element and returns that.
+    """
+
+    @classmethod
+    def xpath(cls, expr):
+        """Return the given XPath expression unchanged."""
+        return expr
+
+    @classmethod
+    def css(cls, expr, attr=None):
+        """Return expr unchanged; the attr argument is currently ignored."""
+        return expr
+
+    @classmethod
+    def getPage(cls, url):
+        """Return url parsed as lxml tree, links made absolute (needs lxml)."""
+        try:
+            from lxml import html
+        except ImportError:
+            raise ValueError(u"Skipping comic %s: Needs lxml (python-lxml) installed." % cls.getName())
+        from lxml.html.defs import link_attrs
+        cls.link_attrs = link_attrs  # cached on the class, read by fetchUrls
+        cls.html = html
+        tree = html.document_fromstring(getPageContent(url, cls.session, raw_data=True))
+        tree.make_links_absolute(url)
+        return tree
+
+    @classmethod
+    def fetchUrls(cls, url, data, urlSearch):
+        """Search all entries for given XPath in a HTML page."""
+        searchUrls = []
+        searches = makeSequence(urlSearch)
+        for search in searches:
+            for match in data.xpath(search):
+                try:  # probe only .attrib so unrelated AttributeErrors surface
+                    attribs = match.attrib
+                except AttributeError:  # not an element: use the value itself
+                    searchUrls.append(str(match))
+                else:
+                    searchUrls.extend(match.get(a) for a in cls.link_attrs if a in attribs)
+            if searchUrls:
+                break  # do not search other links if one pattern matched
+        if not searchUrls:
+            raise ValueError("XPath %s not found at URL %s." % (searches, url))
+        return searchUrls
+
+    @classmethod
+    def fetchText(cls, url, data, textSearch, optional):
+        """Search text entry for given text XPath in a HTML page."""
+        if not textSearch:
+            return None
+        text = ''
+        for match in data.xpath(textSearch):
+            try:
+                text += ' ' + match.text_content()
+            except AttributeError:  # not an element: use the value itself
+                text += ' ' + unicode(match)
+        if not text.strip():
+            if optional:
+                return None
+            raise ValueError("XPath %s did not match anything at URL %s." % (textSearch, url))
+        out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
+        return unescape(text).strip()
+
+
 def find_scraperclasses(comic, multiple_allowed=False):
     """Get a list comic scraper classes. Can return more than one entries if
     multiple_allowed is True, else it raises a ValueError if multiple
diff --git a/requirements.txt b/requirements.txt
index a37178243..d43beb4bd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@
 requests
 # optional:
 argcomplete
+lxml