Fix AhoiPolloi, be a bit smarter about encoding.
HTML character encoding in the context of HTTP is quite tricky to get right, and honestly, I'm not sure I got it right this time. But I think the current behaviour best matches what web browsers try to do: 1. Let Requests figure out the encoding from the HTTP header. This overrides everything else. We need to "trick" LXML into accepting our decision if the document contains an XML declaration that might disagree with the HTTP header. 2. If the HTTP headers don't specify any encoding, let LXML guess the encoding and be done with it.
This commit is contained in:
parent
183d18e7bc
commit
8768ff07b6
3 changed files with 29 additions and 22 deletions
|
@ -88,22 +88,16 @@ class AGirlAndHerFed(_BasicScraper):
|
|||
help = 'Index format: nnn'
|
||||
|
||||
|
||||
class AhoiPolloi(_BasicScraper):
|
||||
class AhoiPolloi(_ParserScraper):
|
||||
url = 'http://ahoipolloi.blogger.de/'
|
||||
stripUrl = url + '?day=%s'
|
||||
firstStripUrl = stripUrl % '20060306'
|
||||
multipleImagesPerStrip = True
|
||||
lang = 'de'
|
||||
imageSearch = compile(tagre('img', 'src',
|
||||
r'(/static/antville/ahoipolloi/images/[^"]+)'))
|
||||
prevSearch = compile(tagre('a', 'href',
|
||||
r'(http://ahoipolloi\.blogger\.de/\?day=\d+)'))
|
||||
imageSearch = '//img[contains(@src, "/static/antville/ahoipolloi/")]'
|
||||
prevSearch = '//a[contains(@href, "/?day=")]'
|
||||
help = 'Index format: yyyymmdd'
|
||||
|
||||
@classmethod
|
||||
def namer(cls, imageUrl, pageUrl):
|
||||
return imageUrl.rsplit('/', 1)[1]
|
||||
|
||||
|
||||
class AhoyEarth(_ParserScraper):
|
||||
url = 'http://www.ahoyearth.com/'
|
||||
|
|
|
@ -29,8 +29,8 @@ except ImportError:
|
|||
pycountry = None
|
||||
|
||||
from . import loader, configuration, languages
|
||||
from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
|
||||
getDirname, unescape, tagre, normaliseURL,
|
||||
from .util import (get_page, makeSequence, get_system_uid, urlopen, getDirname,
|
||||
unescape, tagre, normaliseURL,
|
||||
prettyMatcherList, requests_session)
|
||||
from .comic import ComicStrip
|
||||
from .output import out
|
||||
|
@ -361,7 +361,7 @@ class _BasicScraper(Scraper):
|
|||
|
||||
@classmethod
|
||||
def getPage(cls, url):
|
||||
content = getPageContent(url, cls.session)
|
||||
content = get_page(url, cls.session).text
|
||||
# determine base URL
|
||||
baseUrl = None
|
||||
match = cls.BASE_SEARCH.search(content)
|
||||
|
@ -430,13 +430,27 @@ class _ParserScraper(Scraper):
|
|||
of the HTML element and returns that.
|
||||
"""
|
||||
|
||||
# Taken directly from LXML
|
||||
XML_DECL = re.compile(
|
||||
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
|
||||
|
||||
# Switch between CSS and XPath selectors for this class. Since CSS needs
|
||||
# another Python module, XPath is the default for now.
|
||||
css = False
|
||||
|
||||
@classmethod
|
||||
def getPage(cls, url):
|
||||
tree = html.document_fromstring(getPageContent(url, cls.session))
|
||||
page = get_page(url, cls.session)
|
||||
if page.encoding:
|
||||
# Requests figured out the encoding, so we can deliver Unicode to
|
||||
# LXML. Unfortunately, LXML feels betrayed if there is still an XML
|
||||
# declaration with (probably wrong!) encoding at the top of the
|
||||
# document. Web browsers ignore such if the encoding was specified
|
||||
# in the HTTP header and so do we.
|
||||
text = cls.XML_DECL.sub('\1\2', page.text, count=1)
|
||||
tree = html.document_fromstring(text)
|
||||
else:
|
||||
tree = html.document_fromstring(page.content)
|
||||
tree.make_links_absolute(url)
|
||||
return tree
|
||||
|
||||
|
|
|
@ -185,14 +185,13 @@ def case_insensitive_re(name):
|
|||
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
|
||||
|
||||
|
||||
def getPageContent(url, session, max_content_bytes=MaxContentBytes):
|
||||
def get_page(url, session, max_content_bytes=MaxContentBytes):
|
||||
"""Get text content of given URL."""
|
||||
check_robotstxt(url, session)
|
||||
# read page data
|
||||
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
||||
data = page.text
|
||||
out.debug(u"Got page content %r" % data, level=3)
|
||||
return data
|
||||
out.debug(u"Got page content %r" % page.content, level=3)
|
||||
return page
|
||||
|
||||
|
||||
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
|
||||
|
@ -437,7 +436,7 @@ def strtimezone():
|
|||
zone = time.altzone
|
||||
else:
|
||||
zone = time.timezone
|
||||
return "%+04d" % (-zone//3600)
|
||||
return "%+04d" % (-zone // 3600)
|
||||
|
||||
|
||||
def rfc822date(indate):
|
||||
|
@ -477,12 +476,12 @@ def strsize(b):
|
|||
if b < 1024 * 1024:
|
||||
return "%.2fKB" % (float(b) / 1024)
|
||||
if b < 1024 * 1024 * 10:
|
||||
return "%.2fMB" % (float(b) / (1024*1024))
|
||||
return "%.2fMB" % (float(b) / (1024 * 1024))
|
||||
if b < 1024 * 1024 * 1024:
|
||||
return "%.1fMB" % (float(b) / (1024*1024))
|
||||
return "%.1fMB" % (float(b) / (1024 * 1024))
|
||||
if b < 1024 * 1024 * 1024 * 10:
|
||||
return "%.2fGB" % (float(b) / (1024*1024*1024))
|
||||
return "%.1fGB" % (float(b) / (1024*1024*1024))
|
||||
return "%.2fGB" % (float(b) / (1024 * 1024 * 1024))
|
||||
return "%.1fGB" % (float(b) / (1024 * 1024 * 1024))
|
||||
|
||||
|
||||
def getDirname(name):
|
||||
|
|
Loading…
Reference in a new issue