Create new HTML parser based scraper class.

2014-07-23 20:54:00 +02:00 · 2014-07-23 20:54:00 +02:00 · f9f0b75d7c
commit f9f0b75d7c
parent fcde86e9c0
2 changed files with 80 additions and 0 deletions
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@ -372,6 +372,85 @@ class _BasicScraper(Scraper):
            return None


+class _ParserScraper(Scraper):
+    """
+    Scraper base class that uses an HTML parser and XPath expressions.
+
+    All links are resolved before XPath searches are applied, so all URLs are
+    absolute!
+
+    Subclasses of this class should use XPath expressions as values for
+    prevSearch, imageSearch and textSearch. When the XPath directly selects an
+    attribute, it is used as the output.
+
+    All those searches try to do something intelligent when they match a
+    complete HTML Element: prevSearch and imageSearch try to find a "link
+    attribute" and use that as URL. textSearch strips all tags from the content
+    of the HTML element and returns that.
+    """
+
+    @classmethod
+    def xpath(cls, expr):
+        """Return the given XPath expression unchanged."""
+        return expr
+
+    @classmethod
+    def css(cls, expr, attr=None):
+        """Return expr unchanged; attr is accepted but currently ignored.
+
+        NOTE(review): no CSS-to-XPath translation happens here; presumably a
+        placeholder for CSS selector support -- confirm before relying on it.
+        """
+        return expr
+
+    @classmethod
+    def getPage(cls, url):
+        """Fetch the given URL and return it as a parsed lxml HTML tree.
+
+        lxml is imported lazily because it is only an optional requirement.
+        As a side effect the lxml html module and its set of link attribute
+        names are stored on the class (cls.html, cls.link_attrs); fetchUrls
+        reads cls.link_attrs. All links in the returned tree are made
+        absolute relative to url.
+
+        Raises ValueError if lxml cannot be imported.
+        """
+        try:
+            from lxml import html
+        except ImportError:
+            raise ValueError(u"Skipping comic %s: Needs lxml (python-lxml) installed." % cls.getName())
+        from lxml.html.defs import link_attrs
+        cls.link_attrs = link_attrs
+        cls.html = html
+        tree = html.document_fromstring(getPageContent(url, cls.session, raw_data=True))
+        tree.make_links_absolute(url)
+        return tree
+
+    @classmethod
+    def fetchUrls(cls, url, data, urlSearch):
+        """Search all entries for given XPath in a HTML page.
+
+        urlSearch is normalized with makeSequence and the resulting
+        expressions are tried in order; the first one that yields at least
+        one URL wins. For matches that have an "attrib" mapping (elements),
+        the value of every link attribute present is collected. Any other
+        match (e.g. an attribute selected directly) is used as the URL
+        itself.
+
+        Returns the list of collected URLs.
+        Raises ValueError if none of the expressions produced a URL.
+        """
+        searchUrls = []
+        searches = makeSequence(urlSearch)
+        for search in searches:
+            for match in data.xpath(search):
+                # Only guard the attribute probe, so unrelated
+                # AttributeErrors are not silently turned into bogus URLs.
+                try:
+                    attribs = match.attrib
+                except AttributeError:
+                    # Not an element. Use unicode() like fetchText does:
+                    # str() fails on non-ASCII values under Python 2.
+                    searchUrls.append(unicode(match))
+                    continue
+                searchUrls.extend(match.get(name) for name in cls.link_attrs
+                                  if name in attribs)
+            if searchUrls:
+                # do not search other links if one pattern matched
+                break
+        if not searchUrls:
+            raise ValueError("XPath %s not found at URL %s." % (searches, url))
+        return searchUrls
+
+    @classmethod
+    def fetchText(cls, url, data, textSearch, optional):
+        """Search text entry for given text XPath in a HTML page.
+
+        The text of all matches is joined with single spaces; matches
+        without a text_content() method are converted with unicode().
+
+        Returns the unescaped, stripped text. Returns None when textSearch
+        is empty, or when only whitespace matched and optional is true.
+        Raises ValueError when only whitespace (or nothing) matched and
+        optional is false.
+        """
+        if not textSearch:
+            return None
+        pieces = []
+        for match in data.xpath(textSearch):
+            try:
+                pieces.append(match.text_content())
+            except AttributeError:
+                # No text_content(): not an element, use its string value.
+                pieces.append(unicode(match))
+        text = ' '.join(pieces)
+        if text.strip() == '':
+            if optional:
+                return None
+            raise ValueError("XPath %s did not match anything at URL %s." % (textSearch, url))
+        out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
+        return unescape(text).strip()
+
+
 def find_scraperclasses(comic, multiple_allowed=False):
    """Get a list comic scraper classes. Can return more than one entries if
    multiple_allowed is True, else it raises a ValueError if multiple
--- a/requirements.txt
+++ b/requirements.txt
@ -2,3 +2,4 @@
 requests
 # optional:
 argcomplete
+lxml