Fix AhoiPolloi, be a bit smarter about encoding.

HTML character encoding in the context of HTTP is quite tricky to get
right and honestly, I'm not sure if I did get it right this time. But I
think, the current behaviour matches best what web browsers try to do:

1. Let Requests figure out the content from the HTTP header. This
   overrides everything else. We need to "trick" LXML to accept our
   decision if the document contains an XML declaration which might
   disagree with the HTTP header.
2. If the HTTP headers don't specify any encoding, let LXML guess the
   encoding and be done with it.
This commit is contained in:
Tobias Gruetzmacher 2016-04-06 22:22:22 +02:00
parent 183d18e7bc
commit 8768ff07b6
3 changed files with 29 additions and 22 deletions

View file

@@ -88,22 +88,16 @@ class AGirlAndHerFed(_BasicScraper):
help = 'Index format: nnn' help = 'Index format: nnn'
class AhoiPolloi(_BasicScraper): class AhoiPolloi(_ParserScraper):
url = 'http://ahoipolloi.blogger.de/' url = 'http://ahoipolloi.blogger.de/'
stripUrl = url + '?day=%s' stripUrl = url + '?day=%s'
firstStripUrl = stripUrl % '20060306' firstStripUrl = stripUrl % '20060306'
multipleImagesPerStrip = True multipleImagesPerStrip = True
lang = 'de' lang = 'de'
imageSearch = compile(tagre('img', 'src', imageSearch = '//img[contains(@src, "/static/antville/ahoipolloi/")]'
r'(/static/antville/ahoipolloi/images/[^"]+)')) prevSearch = '//a[contains(@href, "/?day=")]'
prevSearch = compile(tagre('a', 'href',
r'(http://ahoipolloi\.blogger\.de/\?day=\d+)'))
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
@classmethod
def namer(cls, imageUrl, pageUrl):
return imageUrl.rsplit('/', 1)[1]
class AhoyEarth(_ParserScraper): class AhoyEarth(_ParserScraper):
url = 'http://www.ahoyearth.com/' url = 'http://www.ahoyearth.com/'

View file

@@ -29,8 +29,8 @@ except ImportError:
pycountry = None pycountry = None
from . import loader, configuration, languages from . import loader, configuration, languages
from .util import (getPageContent, makeSequence, get_system_uid, urlopen, from .util import (get_page, makeSequence, get_system_uid, urlopen, getDirname,
getDirname, unescape, tagre, normaliseURL, unescape, tagre, normaliseURL,
prettyMatcherList, requests_session) prettyMatcherList, requests_session)
from .comic import ComicStrip from .comic import ComicStrip
from .output import out from .output import out
@@ -361,7 +361,7 @@ class _BasicScraper(Scraper):
@classmethod @classmethod
def getPage(cls, url): def getPage(cls, url):
content = getPageContent(url, cls.session) content = get_page(url, cls.session).text
# determine base URL # determine base URL
baseUrl = None baseUrl = None
match = cls.BASE_SEARCH.search(content) match = cls.BASE_SEARCH.search(content)
@@ -430,13 +430,27 @@ class _ParserScraper(Scraper):
of the HTML element and returns that. of the HTML element and returns that.
""" """
# Taken directly from LXML
XML_DECL = re.compile(
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
# Switch between CSS and XPath selectors for this class. Since CSS needs # Switch between CSS and XPath selectors for this class. Since CSS needs
# another Python module, XPath is the default for now. # another Python module, XPath is the default for now.
css = False css = False
@classmethod @classmethod
def getPage(cls, url): def getPage(cls, url):
tree = html.document_fromstring(getPageContent(url, cls.session)) page = get_page(url, cls.session)
if page.encoding:
# Requests figured out the encoding, so we can deliver Unicode to
# LXML. Unfortunately, LXML feels betrayed if there is still an XML
# declaration with (probably wrong!) encoding at the top of the
# document. Web browsers ignore such if the encoding was specified
# in the HTTP header and so do we.
text = cls.XML_DECL.sub(r'\1\2', page.text, count=1)
tree = html.document_fromstring(text)
else:
tree = html.document_fromstring(page.content)
tree.make_links_absolute(url) tree.make_links_absolute(url)
return tree return tree

View file

@@ -185,14 +185,13 @@ def case_insensitive_re(name):
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name) return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
def getPageContent(url, session, max_content_bytes=MaxContentBytes): def get_page(url, session, max_content_bytes=MaxContentBytes):
"""Get text content of given URL.""" """Get text content of given URL."""
check_robotstxt(url, session) check_robotstxt(url, session)
# read page data # read page data
page = urlopen(url, session, max_content_bytes=max_content_bytes) page = urlopen(url, session, max_content_bytes=max_content_bytes)
data = page.text out.debug(u"Got page content %r" % page.content, level=3)
out.debug(u"Got page content %r" % data, level=3) return page
return data
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes): def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):