Move libxml < 2.9.3 workaround to base class.

2016-05-02 23:22:06 +02:00 · 2016-05-02 23:22:06 +02:00 · 0c1aa9e8bd
commit 0c1aa9e8bd
parent b93a8fde65
3 changed files with 20 additions and 23 deletions
--- a/dosagelib/plugins/b.py
+++ b/dosagelib/plugins/b.py
@ -30,6 +30,7 @@ class BadMachinery(_ParserScraper):
    firstStripUrl = stripUrl % '20090918'
    imageSearch = '//img[@class="comicimg"]'
    prevSearch = '//a[contains(text(), "Previous")]'
+    broken_html_bugfix = True
    help = 'Index format: yyyymmdd'


--- a/dosagelib/plugins/smackjeeves.py
+++ b/dosagelib/plugins/smackjeeves.py
@ -5,8 +5,6 @@

 from __future__ import absolute_import, division, print_function

-import re
-
 from ..util import quote
 from ..scraper import _ParserScraper
 from ..output import out
@ -17,8 +15,6 @@ from ..output import out


 class _SmackJeeves(_ParserScraper):
-    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
-
    ONLY_COMICS = '[contains(@href, "/comics/")]'

    prevSearch = (
@ -46,6 +42,8 @@ class _SmackJeeves(_ParserScraper):
        '//div[@id="comicset"]/object/param[@name="movie"]/@value',
    )

+    broken_html_bugfix = True
+
    @property
    def name(self):
        return 'SmackJeeves/' + super(_SmackJeeves, self).name[2:]
@ -57,17 +55,6 @@ class _SmackJeeves(_ParserScraper):
        else:
            return 'http://%s.smackjeeves.com/comics/' % self.sub

-    def _parse_page(self, data):
-        import lxml.etree
-        if lxml.etree.LIBXML_VERSION < (2, 9, 3):
-            def fix_not_open_tags(match):
-                fix = (len(match.group(1)) * '&lt;') + match.group(2)
-                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
-                         match.group(0), fix), level=2)
-                return fix
-            data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
-        return super(_SmackJeeves, self)._parse_page(data)
-
    def starter(self):
        """Get start URL."""
        start = self.url
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@ -14,11 +14,8 @@ try:
 except ImportError:
    from urlparse import urljoin

-try:
-    from lxml import html
-    from lxml.html.defs import link_attrs as html_link_attrs
-except ImportError:
-    html = None
+from lxml import html, etree
+from lxml.html.defs import link_attrs as html_link_attrs

 try:
    import cssselect
@ -415,6 +412,8 @@ class _ParserScraper(Scraper):
    of the HTML element and returns that.
    """

+    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
+
    # Taken directly from LXML
    XML_DECL = re.compile(
        r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
@ -427,6 +426,11 @@ class _ParserScraper(Scraper):
    # another Python module, XPath is the default for now.
    css = False

+    # Activate a workaround for unescaped < characters on libxml versions older
+    # than 2.9.3. This is disabled by default since most sites are not THAT
+    # broken ;)
+    broken_html_bugfix = False
+
    def getPage(self, url):
        page = get_page(url, self.session)
        if page.encoding:
@ -443,6 +447,14 @@ class _ParserScraper(Scraper):
        return tree

    def _parse_page(self, data):
+        if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
+            def fix_not_open_tags(match):
+                fix = (len(match.group(1)) * '&lt;') + match.group(2)
+                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
+                         match.group(0), fix), level=2)
+                return fix
+            data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
+
        tree = html.document_fromstring(data)
        return tree

@ -509,9 +521,6 @@ class _ParserScraper(Scraper):
            res['css'] = (u"This module needs the cssselect " +
                          u"(python-cssselect) python module which is " +
                          u"not installed.")
-        if html is None:
-            res['lxml'] = (u"This module needs the lxml (python-lxml) " +
-                           u"python module which is not installed.")
        return res