Move libxml < 2.9.3 workaround to base class.

This commit is contained in:
Tobias Gruetzmacher 2016-05-02 23:22:06 +02:00
parent b93a8fde65
commit 0c1aa9e8bd
3 changed files with 20 additions and 23 deletions

View file

@@ -30,6 +30,7 @@ class BadMachinery(_ParserScraper):
firstStripUrl = stripUrl % '20090918' firstStripUrl = stripUrl % '20090918'
imageSearch = '//img[@class="comicimg"]' imageSearch = '//img[@class="comicimg"]'
prevSearch = '//a[contains(text(), "Previous")]' prevSearch = '//a[contains(text(), "Previous")]'
broken_html_bugfix = True
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'

View file

@@ -5,8 +5,6 @@
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import re
from ..util import quote from ..util import quote
from ..scraper import _ParserScraper from ..scraper import _ParserScraper
from ..output import out from ..output import out
@@ -17,8 +15,6 @@ from ..output import out
class _SmackJeeves(_ParserScraper): class _SmackJeeves(_ParserScraper):
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
ONLY_COMICS = '[contains(@href, "/comics/")]' ONLY_COMICS = '[contains(@href, "/comics/")]'
prevSearch = ( prevSearch = (
@@ -46,6 +42,8 @@ class _SmackJeeves(_ParserScraper):
'//div[@id="comicset"]/object/param[@name="movie"]/@value', '//div[@id="comicset"]/object/param[@name="movie"]/@value',
) )
broken_html_bugfix = True
@property @property
def name(self): def name(self):
return 'SmackJeeves/' + super(_SmackJeeves, self).name[2:] return 'SmackJeeves/' + super(_SmackJeeves, self).name[2:]
@@ -57,17 +55,6 @@ class _SmackJeeves(_ParserScraper):
else: else:
return 'http://%s.smackjeeves.com/comics/' % self.sub return 'http://%s.smackjeeves.com/comics/' % self.sub
def _parse_page(self, data):
import lxml.etree
if lxml.etree.LIBXML_VERSION < (2, 9, 3):
def fix_not_open_tags(match):
fix = (len(match.group(1)) * '&lt;') + match.group(2)
out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
match.group(0), fix), level=2)
return fix
data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
return super(_SmackJeeves, self)._parse_page(data)
def starter(self): def starter(self):
"""Get start URL.""" """Get start URL."""
start = self.url start = self.url

View file

@@ -14,11 +14,8 @@ try:
except ImportError: except ImportError:
from urlparse import urljoin from urlparse import urljoin
try: from lxml import html, etree
from lxml import html from lxml.html.defs import link_attrs as html_link_attrs
from lxml.html.defs import link_attrs as html_link_attrs
except ImportError:
html = None
try: try:
import cssselect import cssselect
@@ -415,6 +412,8 @@ class _ParserScraper(Scraper):
of the HTML element and returns that. of the HTML element and returns that.
""" """
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
# Taken directly from LXML # Taken directly from LXML
XML_DECL = re.compile( XML_DECL = re.compile(
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
@@ -427,6 +426,11 @@ class _ParserScraper(Scraper):
# another Python module, XPath is the default for now. # another Python module, XPath is the default for now.
css = False css = False
# Activate a workaround for unescaped < characters on libxml versions older
# than 2.9.3. This is disabled by default since most sites are not THAT
# broken ;)
broken_html_bugfix = False
def getPage(self, url): def getPage(self, url):
page = get_page(url, self.session) page = get_page(url, self.session)
if page.encoding: if page.encoding:
@@ -443,6 +447,14 @@ class _ParserScraper(Scraper):
return tree return tree
def _parse_page(self, data): def _parse_page(self, data):
if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
def fix_not_open_tags(match):
fix = (len(match.group(1)) * '&lt;') + match.group(2)
out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
match.group(0), fix), level=2)
return fix
data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
tree = html.document_fromstring(data) tree = html.document_fromstring(data)
return tree return tree
@@ -509,9 +521,6 @@ class _ParserScraper(Scraper):
res['css'] = (u"This module needs the cssselect " + res['css'] = (u"This module needs the cssselect " +
u"(python-cssselect) python module which is " + u"(python-cssselect) python module which is " +
u"not installed.") u"not installed.")
if html is None:
res['lxml'] = (u"This module needs the lxml (python-lxml) " +
u"python module which is not installed.")
return res return res