From 0c1aa9e8bd72f4955d197306db407b5870edde5a Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Mon, 2 May 2016 23:22:06 +0200 Subject: [PATCH] Move libxml < 2.9.3 workaround to base class. --- dosagelib/plugins/b.py | 1 + dosagelib/plugins/smackjeeves.py | 17 ++--------------- dosagelib/scraper.py | 25 +++++++++++++++++-------- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py index 073802b73..b212f3250 100644 --- a/dosagelib/plugins/b.py +++ b/dosagelib/plugins/b.py @@ -30,6 +30,7 @@ class BadMachinery(_ParserScraper): firstStripUrl = stripUrl % '20090918' imageSearch = '//img[@class="comicimg"]' prevSearch = '//a[contains(text(), "Previous")]' + broken_html_bugfix = True help = 'Index format: yyyymmdd' diff --git a/dosagelib/plugins/smackjeeves.py b/dosagelib/plugins/smackjeeves.py index 24e74fb0e..fe14ace53 100644 --- a/dosagelib/plugins/smackjeeves.py +++ b/dosagelib/plugins/smackjeeves.py @@ -5,8 +5,6 @@ from __future__ import absolute_import, division, print_function -import re - from ..util import quote from ..scraper import _ParserScraper from ..output import out @@ -17,8 +15,6 @@ from ..output import out class _SmackJeeves(_ParserScraper): - BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])') - ONLY_COMICS = '[contains(@href, "/comics/")]' prevSearch = ( @@ -46,6 +42,8 @@ class _SmackJeeves(_ParserScraper): '//div[@id="comicset"]/object/param[@name="movie"]/@value', ) + broken_html_bugfix = True + @property def name(self): return 'SmackJeeves/' + super(_SmackJeeves, self).name[2:] @@ -57,17 +55,6 @@ class _SmackJeeves(_ParserScraper): else: return 'http://%s.smackjeeves.com/comics/' % self.sub - def _parse_page(self, data): - import lxml.etree - if lxml.etree.LIBXML_VERSION < (2, 9, 3): - def fix_not_open_tags(match): - fix = (len(match.group(1)) * '<') + match.group(2) - out.warn("Found possibly broken HTML '%s', fixing as '%s'" % ( - match.group(0), fix), level=2) - return fix - data = 
self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data) - return super(_SmackJeeves, self)._parse_page(data) - def starter(self): """Get start URL.""" start = self.url diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index 6c2f87e17..a15e10f51 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -14,11 +14,8 @@ try: except ImportError: from urlparse import urljoin -try: - from lxml import html - from lxml.html.defs import link_attrs as html_link_attrs -except ImportError: - html = None +from lxml import html, etree +from lxml.html.defs import link_attrs as html_link_attrs try: import cssselect @@ -415,6 +412,8 @@ class _ParserScraper(Scraper): of the HTML element and returns that. """ + BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])') + # Taken directly from LXML XML_DECL = re.compile( r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) @@ -427,6 +426,11 @@ class _ParserScraper(Scraper): # another Python module, XPath is the default for now. css = False + # Activate a workaround for unescaped < characters on libxml version older + # than 2.9.3. 
This is disabled by default since most sites are not THAT + # broken ;) + broken_html_bugfix = False + def getPage(self, url): page = get_page(url, self.session) if page.encoding: @@ -443,6 +447,14 @@ class _ParserScraper(Scraper): return tree def _parse_page(self, data): + if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3): + def fix_not_open_tags(match): + fix = (len(match.group(1)) * '<') + match.group(2) + out.warn("Found possibly broken HTML '%s', fixing as '%s'" % ( + match.group(0), fix), level=2) + return fix + data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data) + tree = html.document_fromstring(data) return tree @@ -509,9 +521,6 @@ class _ParserScraper(Scraper): res['css'] = (u"This module needs the cssselect " + u"(python-cssselect) python module which is " + u"not installed.") - if html is None: - res['lxml'] = (u"This module needs the lxml (python-lxml) " + - u"python module which is not installed.") return res