Fix "tagsoup" on SmackJeeves
Unfortunately, browsers render < outside of HTML tags differently than libXML did until recently (libXML 2.9.3), so we need to preprocess pages before parsing them... (This was fixed in libXML commit 140c25)
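For illustration only (not part of this commit): a minimal standalone sketch of the preprocessing idea, using the same regex as the diff below. The helper name escape_not_open_tags is made up here.

import re

# A run of '<' followed by a space, '=' or a digit is almost certainly
# literal text, not the start of a tag.
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')

def escape_not_open_tags(data):
    # Replace each '<' in the run with its entity; keep the next character.
    return BROKEN_NOT_OPEN_TAGS.sub(
        lambda m: len(m.group(1)) * '&lt;' + m.group(2), data)

print(escape_not_open_tags('<b>I <3 tags, but 1 < 2</b>'))
# -> <b>I &lt;3 tags, but 1 &lt; 2</b>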
This commit is contained in:
parent 00fb51341a
commit 8b1ac4eb35

2 changed files with 25 additions and 4 deletions
dosagelib/plugins/smackjeeves.py
@@ -5,14 +5,20 @@
 from __future__ import absolute_import, division, print_function
 
+import re
+
 from ..util import quote
 from ..scraper import _ParserScraper
+from ..output import out
 
 
 # SmackJeeves is a crawlers nightmare - users are allowed to edit HTML
-# directly.
+# directly. Additionally, users use unescaped < characters sometimes, which
+# breaks the parse tree on libxml2 before 2.9.3...
 
 
 class _SmackJeeves(_ParserScraper):
+    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
+
     ONLY_COMICS = '[contains(@href, "/comics/")]'
 
     prevSearch = (
@@ -51,6 +57,17 @@ class _SmackJeeves(_ParserScraper):
         else:
             return 'http://%s.smackjeeves.com/comics/' % self.sub
 
+    def _parse_page(self, data):
+        import lxml.etree
+        if lxml.etree.LIBXML_VERSION < (2, 9, 3):
+            def fix_not_open_tags(match):
+                fix = (len(match.group(1)) * '&lt;') + match.group(2)
+                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
+                    match.group(0), fix), level=2)
+                return fix
+            data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
+        return super(_SmackJeeves, self)._parse_page(data)
+
     def starter(self):
         """Get start URL."""
         start = self.url
@@ -66,7 +83,7 @@ class _SmackJeeves(_ParserScraper):
             if not self.shouldSkipUrl(prevurl, data):
                 previmg = self.fetchUrl(prevurl, data, self.imageSearch)
                 if startimg and previmg and startimg == previmg:
-                    print("Matching! %s %s" % (prevurl, self.name))
+                    out.debug("Matching! %s %s" % (prevurl, self.name))
                     return prevurl
             else:
                 return self.fetchUrl(prevurl, data, self.nextSearch)
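As a side note, whether the workaround branch above is taken depends only on the libxml2 version lxml was built against; a quick check, assuming lxml is installed:

import lxml.etree

version = '.'.join(map(str, lxml.etree.LIBXML_VERSION))
if lxml.etree.LIBXML_VERSION < (2, 9, 3):
    print('libxml2 %s: preprocessing is needed' % version)
else:
    print('libxml2 %s: the parser already tolerates stray < itself' % version)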
dosagelib/scraper.py
@@ -436,12 +436,16 @@ class _ParserScraper(Scraper):
             # document. Web browsers ignore such if the encoding was specified
             # in the HTTP header and so do we.
             text = self.XML_DECL.sub('\1\2', page.text, count=1)
-            tree = html.document_fromstring(text)
+            tree = self._parse_page(text)
         else:
-            tree = html.document_fromstring(page.content)
+            tree = self._parse_page(page.content)
         tree.make_links_absolute(url)
         return tree
 
+    def _parse_page(self, data):
+        tree = html.document_fromstring(data)
+        return tree
+
     def fetchUrls(self, url, data, urlSearch):
         """Search all entries for given XPath in a HTML page."""
         searchUrls = []
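The point of the new _parse_page hook is that subclasses can rewrite the raw page data before it reaches libxml2. A minimal sketch of the pattern (the class names and the cleanup rule are invented for illustration):

from lxml import html

class Base(object):  # stand-in for _ParserScraper
    def _parse_page(self, data):
        return html.document_fromstring(data)

class FixesTagsoup(Base):  # stand-in for _SmackJeeves
    def _parse_page(self, data):
        # Site-specific cleanup happens before the generic parse.
        data = data.replace('<3', '&lt;3')
        return super(FixesTagsoup, self)._parse_page(data)

print(html.tostring(FixesTagsoup()._parse_page('<p>I <3 webcomics</p>')))
# -> <html><body><p>I &lt;3 webcomics</p></body></html> (as bytes)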