Pass unicode strings to lxml.

This reverts commit fcde86e9c0 and some
more. This lets python-requests handle all of the encoding and leaves
lxml with (hopefully) clean unicode HTML to parse.
Tobias Gruetzmacher 2014-10-13 19:39:48 +02:00
parent e87f5993b8
commit 3235b8b312
2 changed files with 11 additions and 11 deletions
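
For context, a minimal standalone sketch (not part of this commit; the URL
is hypothetical) of the division of labor the message describes: requests
decodes the response body to unicode using the charset from the HTTP headers
(falling back to a guess), and lxml then parses that unicode directly instead
of doing its own encoding detection on raw bytes.

    import requests
    from lxml import html

    resp = requests.get("https://example.com/")
    # resp.content is the raw byte payload; resp.text is unicode,
    # already decoded by requests.
    tree = html.document_fromstring(resp.text)  # lxml parses unicode as-is
    print(tree.findtext(".//title"))

One known caveat of this approach: lxml rejects unicode input that still
carries an explicit XML encoding declaration, hence, presumably, the
"(hopefully)" in the commit message.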


@@ -406,7 +406,7 @@ class _ParserScraper(Scraper):
         from lxml.html.defs import link_attrs
         cls.link_attrs = link_attrs
         cls.html = html
-        tree = html.document_fromstring(getPageContent(url, cls.session, raw_data=True))
+        tree = html.document_fromstring(getPageContent(url, cls.session))
         tree.make_links_absolute(url)
         return tree
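
As a small illustration of the parsing path above (markup and base URL made
up): document_fromstring accepts a unicode string directly, and
make_links_absolute then resolves relative links against the page URL.

    from lxml import html

    page = u'<html><body><a href="/comic/42">next</a></body></html>'
    tree = html.document_fromstring(page)  # unicode in, no decoding by lxml
    tree.make_links_absolute("https://example.com/")
    print(tree.xpath("//a/@href"))  # ['https://example.com/comic/42']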


@@ -180,28 +180,28 @@ def isValidPageContent(data):
     """Check if page content is empty or has error messages."""
     # The python requests library sometimes returns empty data.
     # Some webservers have a 200 OK status but have an error message as response.
-    return data and not data.startswith(b"Internal Server Error")
+    return data and not data.startswith("Internal Server Error")
 
 
-def getPageContent(url, session, max_content_bytes=MaxContentBytes, raw_data=False):
-    """Get text content of given URL. If raw_data is False we try hard not to
-    decode the page content before returning (We work on page.content instead
-    of page.text most of the time)."""
+def getPageContent(url, session, max_content_bytes=MaxContentBytes):
+    """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
     try:
         page = urlopen(url, session, max_content_bytes=max_content_bytes)
     except IOError:
         page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    data = page.text
     tries = MaxRetries
-    while not isValidPageContent(page.content) and tries > 0:
+    while not isValidPageContent(data) and tries > 0:
         time.sleep(RetryPauseSeconds)
         page = urlopen(url, session, max_content_bytes=max_content_bytes)
+        data = page.text
         tries -= 1
-    if not isValidPageContent(page.content):
-        raise ValueError("Got invalid page content from %s: %r" % (url, page.text))
-    if out.level >= 3: out.debug(u"Got page content %r" % page.text, level=3)
-    return page.content if raw_data else page.text
+    if not isValidPageContent(data):
+        raise ValueError("Got invalid page content from %s: %r" % (url, data))
+    out.debug(u"Got page content %r" % data, level=3)
+    return data
 
 
 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
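
One subtlety in the isValidPageContent() change above: data is now page.text,
a unicode string, so the error-message prefix has to be a str literal; on
Python 3 a bytes prefix would raise TypeError instead of simply not matching.
A tiny standalone illustration (not from the codebase):

    data = u"Internal Server Error"
    print(data.startswith("Internal Server Error"))  # True
    try:
        data.startswith(b"Internal Server Error")    # str vs. bytes prefix
    except TypeError as exc:
        print("Python 3 rejects a bytes prefix:", exc)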