Fix "tagsoup" on SmackJeeves

Unfortunately, browsers render < outside of HTML tags differently than
libXML did until recently (libXML 2.9.3), so we need to preprocess pages
before parsing them...

(This was fixed in libXML commit 140c25)
This commit is contained in:
Tobias Gruetzmacher 2016-04-26 08:05:38 +02:00
parent 00fb51341a
commit 8b1ac4eb35
2 changed files with 25 additions and 4 deletions

View file

@ -5,14 +5,20 @@
from __future__ import absolute_import, division, print_function
import re
from ..util import quote
from ..scraper import _ParserScraper
from ..output import out
# SmackJeeves is a crawlers nightmare - users are allowed to edit HTML
# directly.
# directly. Additionally, users use unescaped < characters sometimes, which
# breaks the parse tree on libxml2 before 2.9.3...
class _SmackJeeves(_ParserScraper):
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
ONLY_COMICS = '[contains(@href, "/comics/")]'
prevSearch = (
@ -51,6 +57,17 @@ class _SmackJeeves(_ParserScraper):
else:
return 'http://%s.smackjeeves.com/comics/' % self.sub
def _parse_page(self, data):
    """Parse a page, escaping stray '<' characters first on old libxml2.

    libxml2 releases before 2.9.3 build a broken parse tree when a page
    contains an unescaped '<' outside of a tag, so each such run is
    rewritten to '&lt;' entities before handing the text to the parser.
    """
    # Lazy import: only needed to read the library version here.
    import lxml.etree

    if lxml.etree.LIBXML_VERSION < (2, 9, 3):
        def escape_stray_lt(match):
            # One '&lt;' per '<' in the run, keeping the trailing char.
            replacement = (len(match.group(1)) * '&lt;') + match.group(2)
            out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
                match.group(0), replacement), level=2)
            return replacement

        data = self.BROKEN_NOT_OPEN_TAGS.sub(escape_stray_lt, data)
    return super(_SmackJeeves, self)._parse_page(data)
def starter(self):
"""Get start URL."""
start = self.url
@ -66,7 +83,7 @@ class _SmackJeeves(_ParserScraper):
if not self.shouldSkipUrl(prevurl, data):
previmg = self.fetchUrl(prevurl, data, self.imageSearch)
if startimg and previmg and startimg == previmg:
print("Matching! %s %s" % (prevurl, self.name))
out.debug("Matching! %s %s" % (prevurl, self.name))
return prevurl
else:
return self.fetchUrl(prevurl, data, self.nextSearch)

View file

@ -436,12 +436,16 @@ class _ParserScraper(Scraper):
# document. Web browsers ignore such if the encoding was specified
# in the HTTP header and so do we.
text = self.XML_DECL.sub('\1\2', page.text, count=1)
tree = html.document_fromstring(text)
tree = self._parse_page(text)
else:
tree = html.document_fromstring(page.content)
tree = self._parse_page(page.content)
tree.make_links_absolute(url)
return tree
def _parse_page(self, data):
    """Build an lxml document tree from raw HTML text or bytes."""
    return html.document_fromstring(data)
def fetchUrls(self, url, data, urlSearch):
"""Search all entries for given XPath in a HTML page."""
searchUrls = []