Move libxml < 2.9.3 workaround to base class.

2016-05-02 23:22:06 +02:00 · 2016-05-02 23:22:06 +02:00 · 0c1aa9e8bd
commit 0c1aa9e8bd
parent b93a8fde65
3 changed files with 20 additions and 23 deletions
--- a/dosagelib/plugins/b.py
+++ b/dosagelib/plugins/b.py
@ -30,6 +30,7 @@ class BadMachinery(_ParserScraper):
    firstStripUrl = stripUrl % '20090918'
    imageSearch = '//img[@class="comicimg"]'
    prevSearch = '//a[contains(text(), "Previous")]'
    broken_html_bugfix = True
    help = 'Index format: yyyymmdd'
--- a/dosagelib/plugins/smackjeeves.py
+++ b/dosagelib/plugins/smackjeeves.py
@ -5,8 +5,6 @@
 from __future__ import absolute_import, division, print_function
 import re
 from ..util import quote
 from ..scraper import _ParserScraper
 from ..output import out
@ -17,8 +15,6 @@ from ..output import out
 class _SmackJeeves(_ParserScraper):
    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
    ONLY_COMICS = '[contains(@href, "/comics/")]'
    prevSearch = (
@ -46,6 +42,8 @@ class _SmackJeeves(_ParserScraper):
        '//div[@id="comicset"]/object/param[@name="movie"]/@value',
    )
    broken_html_bugfix = True
    @property
    def name(self):
        """Return the inherited name, minus its first two characters,
        prefixed with 'SmackJeeves/'."""
        inherited = super(_SmackJeeves, self).name
        return 'SmackJeeves/' + inherited[2:]
@ -57,17 +55,6 @@ class _SmackJeeves(_ParserScraper):
        else:
            return 'http://%s.smackjeeves.com/comics/' % self.sub
    def _parse_page(self, data):
        """Pre-process page data and hand it to the parent parser.

        When lxml is linked against a libxml older than 2.9.3, every match
        of BROKEN_NOT_OPEN_TAGS (a run of '<' followed by a space, '=' or a
        digit) has its '<' characters replaced by '&lt;' before parsing.
        Each replacement is reported through out.warn at level 2.
        """
        import lxml.etree

        # Recent libxml versions need no pre-processing.
        if lxml.etree.LIBXML_VERSION >= (2, 9, 3):
            return super(_SmackJeeves, self)._parse_page(data)

        def escape_stray_brackets(found):
            brackets, trailer = found.group(1), found.group(2)
            replacement = '&lt;' * len(brackets) + trailer
            out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
                     found.group(0), replacement), level=2)
            return replacement

        cleaned = self.BROKEN_NOT_OPEN_TAGS.sub(escape_stray_brackets, data)
        return super(_SmackJeeves, self)._parse_page(cleaned)
    def starter(self):
        """Get start URL."""
        start = self.url
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@ -14,11 +14,8 @@ try:
 except ImportError:
    from urlparse import urljoin
-try:
+from lxml import html, etree
-    from lxml import html
+from lxml.html.defs import link_attrs as html_link_attrs
    from lxml.html.defs import link_attrs as html_link_attrs
 except ImportError:
    html = None
 try:
    import cssselect
@ -415,6 +412,8 @@ class _ParserScraper(Scraper):
    of the HTML element and returns that.
    """
    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
    # Taken directly from LXML
    XML_DECL = re.compile(
        r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
@ -427,6 +426,11 @@ class _ParserScraper(Scraper):
    # another Python module, XPath is the default for now.
    css = False
    # Activate a workaround for unescaped < characters on libxml versions older
    # than 2.9.3. This is disabled by default since most sites are not THAT
    # broken ;)
    broken_html_bugfix = False
    def getPage(self, url):
        page = get_page(url, self.session)
        if page.encoding:
@ -443,6 +447,14 @@ class _ParserScraper(Scraper):
        return tree
    def _parse_page(self, data):
        """Build an lxml HTML tree from ``data``.

        If ``broken_html_bugfix`` is enabled and the libxml in use is older
        than 2.9.3, every match of BROKEN_NOT_OPEN_TAGS (a run of '<'
        followed by a space, '=' or a digit) first has its '<' characters
        rewritten as '&lt;'. Each rewrite is reported through out.warn at
        level 2.
        """
        workaround_needed = (self.broken_html_bugfix and
                             etree.LIBXML_VERSION < (2, 9, 3))
        if workaround_needed:
            def escape_stray_brackets(found):
                brackets, trailer = found.group(1), found.group(2)
                replacement = '&lt;' * len(brackets) + trailer
                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
                         found.group(0), replacement), level=2)
                return replacement

            data = self.BROKEN_NOT_OPEN_TAGS.sub(escape_stray_brackets, data)
        return html.document_fromstring(data)
@ -509,9 +521,6 @@ class _ParserScraper(Scraper):
            res['css'] = (u"This module needs the cssselect " +
                          u"(python-cssselect) python module which is " +
                          u"not installed.")
        if html is None:
            res['lxml'] = (u"This module needs the lxml (python-lxml) " +
                           u"python module which is not installed.")
        return res