Move libxml < 2.9.3 workaround to base class.
This commit is contained in:
parent
b93a8fde65
commit
0c1aa9e8bd
3 changed files with 20 additions and 23 deletions
|
@ -30,6 +30,7 @@ class BadMachinery(_ParserScraper):
|
|||
firstStripUrl = stripUrl % '20090918'
|
||||
imageSearch = '//img[@class="comicimg"]'
|
||||
prevSearch = '//a[contains(text(), "Previous")]'
|
||||
broken_html_bugfix = True
|
||||
help = 'Index format: yyyymmdd'
|
||||
|
||||
|
||||
|
|
|
@ -5,8 +5,6 @@
|
|||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import re
|
||||
|
||||
from ..util import quote
|
||||
from ..scraper import _ParserScraper
|
||||
from ..output import out
|
||||
|
@ -17,8 +15,6 @@ from ..output import out
|
|||
|
||||
|
||||
class _SmackJeeves(_ParserScraper):
|
||||
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
|
||||
|
||||
ONLY_COMICS = '[contains(@href, "/comics/")]'
|
||||
|
||||
prevSearch = (
|
||||
|
@ -46,6 +42,8 @@ class _SmackJeeves(_ParserScraper):
|
|||
'//div[@id="comicset"]/object/param[@name="movie"]/@value',
|
||||
)
|
||||
|
||||
broken_html_bugfix = True
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return 'SmackJeeves/' + super(_SmackJeeves, self).name[2:]
|
||||
|
@ -57,17 +55,6 @@ class _SmackJeeves(_ParserScraper):
|
|||
else:
|
||||
return 'http://%s.smackjeeves.com/comics/' % self.sub
|
||||
|
||||
def _parse_page(self, data):
|
||||
import lxml.etree
|
||||
if lxml.etree.LIBXML_VERSION < (2, 9, 3):
|
||||
def fix_not_open_tags(match):
|
||||
fix = (len(match.group(1)) * '<') + match.group(2)
|
||||
out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
|
||||
match.group(0), fix), level=2)
|
||||
return fix
|
||||
data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
|
||||
return super(_SmackJeeves, self)._parse_page(data)
|
||||
|
||||
def starter(self):
|
||||
"""Get start URL."""
|
||||
start = self.url
|
||||
|
|
|
@ -14,11 +14,8 @@ try:
|
|||
except ImportError:
|
||||
from urlparse import urljoin
|
||||
|
||||
try:
|
||||
from lxml import html
|
||||
from lxml.html.defs import link_attrs as html_link_attrs
|
||||
except ImportError:
|
||||
html = None
|
||||
from lxml import html, etree
|
||||
from lxml.html.defs import link_attrs as html_link_attrs
|
||||
|
||||
try:
|
||||
import cssselect
|
||||
|
@ -415,6 +412,8 @@ class _ParserScraper(Scraper):
|
|||
of the HTML element and returns that.
|
||||
"""
|
||||
|
||||
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
|
||||
|
||||
# Taken directly from LXML
|
||||
XML_DECL = re.compile(
|
||||
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
|
||||
|
@ -427,6 +426,11 @@ class _ParserScraper(Scraper):
|
|||
# another Python module, XPath is the default for now.
|
||||
css = False
|
||||
|
||||
# Activate a workaround for unescaped < characters on libxml version older
|
||||
# than 2.9.3. This is disabled by default since most sites are not THAT
|
||||
# broken ;)
|
||||
broken_html_bugfix = False
|
||||
|
||||
def getPage(self, url):
|
||||
page = get_page(url, self.session)
|
||||
if page.encoding:
|
||||
|
@ -443,6 +447,14 @@ class _ParserScraper(Scraper):
|
|||
return tree
|
||||
|
||||
def _parse_page(self, data):
|
||||
if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
|
||||
def fix_not_open_tags(match):
|
||||
fix = (len(match.group(1)) * '<') + match.group(2)
|
||||
out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
|
||||
match.group(0), fix), level=2)
|
||||
return fix
|
||||
data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
|
||||
|
||||
tree = html.document_fromstring(data)
|
||||
return tree
|
||||
|
||||
|
@ -509,9 +521,6 @@ class _ParserScraper(Scraper):
|
|||
res['css'] = (u"This module needs the cssselect " +
|
||||
u"(python-cssselect) python module which is " +
|
||||
u"not installed.")
|
||||
if html is None:
|
||||
res['lxml'] = (u"This module needs the lxml (python-lxml) " +
|
||||
u"python module which is not installed.")
|
||||
return res
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue