Move libxml < 2.9.3 workaround to base class.
This commit is contained in:
parent
b93a8fde65
commit
0c1aa9e8bd
3 changed files with 20 additions and 23 deletions
|
@ -30,6 +30,7 @@ class BadMachinery(_ParserScraper):
|
||||||
firstStripUrl = stripUrl % '20090918'
|
firstStripUrl = stripUrl % '20090918'
|
||||||
imageSearch = '//img[@class="comicimg"]'
|
imageSearch = '//img[@class="comicimg"]'
|
||||||
prevSearch = '//a[contains(text(), "Previous")]'
|
prevSearch = '//a[contains(text(), "Previous")]'
|
||||||
|
broken_html_bugfix = True
|
||||||
help = 'Index format: yyyymmdd'
|
help = 'Index format: yyyymmdd'
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,8 +5,6 @@
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from ..util import quote
|
from ..util import quote
|
||||||
from ..scraper import _ParserScraper
|
from ..scraper import _ParserScraper
|
||||||
from ..output import out
|
from ..output import out
|
||||||
|
@ -17,8 +15,6 @@ from ..output import out
|
||||||
|
|
||||||
|
|
||||||
class _SmackJeeves(_ParserScraper):
|
class _SmackJeeves(_ParserScraper):
|
||||||
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
|
|
||||||
|
|
||||||
ONLY_COMICS = '[contains(@href, "/comics/")]'
|
ONLY_COMICS = '[contains(@href, "/comics/")]'
|
||||||
|
|
||||||
prevSearch = (
|
prevSearch = (
|
||||||
|
@ -46,6 +42,8 @@ class _SmackJeeves(_ParserScraper):
|
||||||
'//div[@id="comicset"]/object/param[@name="movie"]/@value',
|
'//div[@id="comicset"]/object/param[@name="movie"]/@value',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
broken_html_bugfix = True
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def name(self):
|
def name(self):
|
||||||
return 'SmackJeeves/' + super(_SmackJeeves, self).name[2:]
|
return 'SmackJeeves/' + super(_SmackJeeves, self).name[2:]
|
||||||
|
@ -57,17 +55,6 @@ class _SmackJeeves(_ParserScraper):
|
||||||
else:
|
else:
|
||||||
return 'http://%s.smackjeeves.com/comics/' % self.sub
|
return 'http://%s.smackjeeves.com/comics/' % self.sub
|
||||||
|
|
||||||
def _parse_page(self, data):
|
|
||||||
import lxml.etree
|
|
||||||
if lxml.etree.LIBXML_VERSION < (2, 9, 3):
|
|
||||||
def fix_not_open_tags(match):
|
|
||||||
fix = (len(match.group(1)) * '<') + match.group(2)
|
|
||||||
out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
|
|
||||||
match.group(0), fix), level=2)
|
|
||||||
return fix
|
|
||||||
data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
|
|
||||||
return super(_SmackJeeves, self)._parse_page(data)
|
|
||||||
|
|
||||||
def starter(self):
|
def starter(self):
|
||||||
"""Get start URL."""
|
"""Get start URL."""
|
||||||
start = self.url
|
start = self.url
|
||||||
|
|
|
@ -14,11 +14,8 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from urlparse import urljoin
|
from urlparse import urljoin
|
||||||
|
|
||||||
try:
|
from lxml import html, etree
|
||||||
from lxml import html
|
from lxml.html.defs import link_attrs as html_link_attrs
|
||||||
from lxml.html.defs import link_attrs as html_link_attrs
|
|
||||||
except ImportError:
|
|
||||||
html = None
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import cssselect
|
import cssselect
|
||||||
|
@ -415,6 +412,8 @@ class _ParserScraper(Scraper):
|
||||||
of the HTML element and returns that.
|
of the HTML element and returns that.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
|
||||||
|
|
||||||
# Taken directly from LXML
|
# Taken directly from LXML
|
||||||
XML_DECL = re.compile(
|
XML_DECL = re.compile(
|
||||||
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
|
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
|
||||||
|
@ -427,6 +426,11 @@ class _ParserScraper(Scraper):
|
||||||
# another Python module, XPath is the default for now.
|
# another Python module, XPath is the default for now.
|
||||||
css = False
|
css = False
|
||||||
|
|
||||||
|
# Activate a workaround for unescaped < characters on libxml versions older
|
||||||
|
# than 2.9.3. This is disabled by default since most sites are not THAT
|
||||||
|
# broken ;)
|
||||||
|
broken_html_bugfix = False
|
||||||
|
|
||||||
def getPage(self, url):
|
def getPage(self, url):
|
||||||
page = get_page(url, self.session)
|
page = get_page(url, self.session)
|
||||||
if page.encoding:
|
if page.encoding:
|
||||||
|
@ -443,6 +447,14 @@ class _ParserScraper(Scraper):
|
||||||
return tree
|
return tree
|
||||||
|
|
||||||
def _parse_page(self, data):
|
def _parse_page(self, data):
|
||||||
|
if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
|
||||||
|
def fix_not_open_tags(match):
|
||||||
|
fix = (len(match.group(1)) * '<') + match.group(2)
|
||||||
|
out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
|
||||||
|
match.group(0), fix), level=2)
|
||||||
|
return fix
|
||||||
|
data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
|
||||||
|
|
||||||
tree = html.document_fromstring(data)
|
tree = html.document_fromstring(data)
|
||||||
return tree
|
return tree
|
||||||
|
|
||||||
|
@ -509,9 +521,6 @@ class _ParserScraper(Scraper):
|
||||||
res['css'] = (u"This module needs the cssselect " +
|
res['css'] = (u"This module needs the cssselect " +
|
||||||
u"(python-cssselect) python module which is " +
|
u"(python-cssselect) python module which is " +
|
||||||
u"not installed.")
|
u"not installed.")
|
||||||
if html is None:
|
|
||||||
res['lxml'] = (u"This module needs the lxml (python-lxml) " +
|
|
||||||
u"python module which is not installed.")
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue