Fix "tagsoup" on SmackJeeves

Unfortunately, browsers render < outside of HTML tags differently than
libXML did until recently (libXML 2.9.3), so we need to preprocess pages
before parsing them...

(This was fixed in libXML commit 140c25)
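To illustrate the preprocessing: the idea is to escape stray < characters
that cannot start a real tag (ones followed by a space, '=' or a digit)
before handing the page to lxml. A minimal standalone sketch of that
transformation, using the same regular expression as the patch below (the
helper function name is only for illustration, it is not part of dosage):

    import re

    # '<' followed by a space, '=' or a digit can never open a real HTML tag,
    # so it is safe to escape it.
    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')

    def escape_broken_lt(data):
        # Replace each matched '<' with '&lt;' so libxml2 (before 2.9.3)
        # does not truncate the parse tree at that point.
        return BROKEN_NOT_OPEN_TAGS.sub(
            lambda m: (len(m.group(1)) * '&lt;') + m.group(2), data)

    print(escape_broken_lt('score < 9000, <3 you all <b>bold</b>'))
    # -> score &lt; 9000, &lt;3 you all <b>bold</b>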
This commit is contained in:
Tobias Gruetzmacher 2016-04-26 08:05:38 +02:00
parent 00fb51341a
commit 8b1ac4eb35
2 changed files with 25 additions and 4 deletions

@@ -5,14 +5,20 @@
 from __future__ import absolute_import, division, print_function
 
+import re
+
 from ..util import quote
 from ..scraper import _ParserScraper
+from ..output import out
 
 
 # SmackJeeves is a crawlers nightmare - users are allowed to edit HTML
-# directly.
+# directly. Additionally, users use unescaped < characters sometimes, which
+# breaks the parse tree on libxml2 before 2.9.3...
 
 
 class _SmackJeeves(_ParserScraper):
+    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
+
     ONLY_COMICS = '[contains(@href, "/comics/")]'
 
     prevSearch = (
@@ -51,6 +57,17 @@ class _SmackJeeves(_ParserScraper):
         else:
             return 'http://%s.smackjeeves.com/comics/' % self.sub
 
+    def _parse_page(self, data):
+        import lxml.etree
+        if lxml.etree.LIBXML_VERSION < (2, 9, 3):
+            def fix_not_open_tags(match):
+                fix = (len(match.group(1)) * '&lt;') + match.group(2)
+                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
+                    match.group(0), fix), level=2)
+                return fix
+            data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
+        return super(_SmackJeeves, self)._parse_page(data)
+
     def starter(self):
         """Get start URL."""
         start = self.url
@@ -66,7 +83,7 @@ class _SmackJeeves(_ParserScraper):
         if not self.shouldSkipUrl(prevurl, data):
             previmg = self.fetchUrl(prevurl, data, self.imageSearch)
             if startimg and previmg and startimg == previmg:
-                print("Matching! %s %s" % (prevurl, self.name))
+                out.debug("Matching! %s %s" % (prevurl, self.name))
                 return prevurl
             else:
                 return self.fetchUrl(prevurl, data, self.nextSearch)

@@ -436,12 +436,16 @@ class _ParserScraper(Scraper):
             # document. Web browsers ignore such if the encoding was specified
             # in the HTTP header and so do we.
             text = self.XML_DECL.sub('\1\2', page.text, count=1)
-            tree = html.document_fromstring(text)
+            tree = self._parse_page(text)
         else:
-            tree = html.document_fromstring(page.content)
+            tree = self._parse_page(page.content)
         tree.make_links_absolute(url)
         return tree
 
+    def _parse_page(self, data):
+        tree = html.document_fromstring(data)
+        return tree
+
     def fetchUrls(self, url, data, urlSearch):
         """Search all entries for given XPath in a HTML page."""
         searchUrls = []