Pass unicode strings to lxml.
This reverts commit fcde86e9c0 & some more. This lets python-requests do all the
encoding stuff and leaves lxml with (hopefully) clean unicode HTML to parse.
parent e87f5993b8, commit 3235b8b312
2 changed files with 11 additions and 11 deletions
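The commit message's point, in isolation: requests decodes the HTTP response using its headers (with charset detection as a fallback), so response.text is already a unicode string and lxml only has to parse it. A rough standalone sketch of that flow (not code from this repository; fetch_tree and the plain requests.get call are illustrative, the real code goes through getPageContent and a shared session):

import requests
from lxml import html

def fetch_tree(url):
    # requests picks the encoding from the HTTP headers (guessing only as a
    # fallback), so .text is already decoded unicode.
    response = requests.get(url)
    response.raise_for_status()
    # lxml parses the decoded string; no byte/encoding handling needed here.
    tree = html.document_fromstring(response.text)
    tree.make_links_absolute(url)
    return tree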
@@ -406,7 +406,7 @@ class _ParserScraper(Scraper):
         from lxml.html.defs import link_attrs
         cls.link_attrs = link_attrs
         cls.html = html
-        tree = html.document_fromstring(getPageContent(url, cls.session, raw_data=True))
+        tree = html.document_fromstring(getPageContent(url, cls.session))
         tree.make_links_absolute(url)
         return tree

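lxml's html.document_fromstring accepts either raw bytes or an already-decoded string; the hunk above switches the scraper to the second form, so the encoding decision happens in the HTTP layer rather than inside lxml. A tiny standalone illustration (not part of the patch):

from lxml import html

# Bytes input: lxml has to work out the encoding itself.
tree_from_bytes = html.document_fromstring(b"<p>caf\xc3\xa9</p>")
# Unicode input: the caller (here requests via page.text) already decoded it.
tree_from_text = html.document_fromstring(u"<p>caf\u00e9</p>")
print(tree_from_text.text_content())  # café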
@@ -180,28 +180,28 @@ def isValidPageContent(data):
     """Check if page content is empty or has error messages."""
     # The python requests library sometimes returns empty data.
     # Some webservers have a 200 OK status but have an error message as response.
-    return data and not data.startswith(b"Internal Server Error")
+    return data and not data.startswith("Internal Server Error")


-def getPageContent(url, session, max_content_bytes=MaxContentBytes, raw_data=False):
-    """Get text content of given URL. If raw_data is False we try hard not to
-    decode the page content before returning (We work on page.content instead
-    of page.text most of the time)."""
+def getPageContent(url, session, max_content_bytes=MaxContentBytes):
+    """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
     try:
         page = urlopen(url, session, max_content_bytes=max_content_bytes)
     except IOError:
         page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    data = page.text
     tries = MaxRetries
-    while not isValidPageContent(page.content) and tries > 0:
+    while not isValidPageContent(data) and tries > 0:
         time.sleep(RetryPauseSeconds)
         page = urlopen(url, session, max_content_bytes=max_content_bytes)
+        data = page.text
         tries -= 1
-    if not isValidPageContent(page.content):
-        raise ValueError("Got invalid page content from %s: %r" % (url, page.text))
-    if out.level >= 3: out.debug(u"Got page content %r" % page.text, level=3)
-    return page.content if raw_data else page.text
+    if not isValidPageContent(data):
+        raise ValueError("Got invalid page content from %s: %r" % (url, data))
+    out.debug(u"Got page content %r" % data, level=3)
+    return data


 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
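The reworked getPageContent validates and retries on the decoded text instead of raw bytes, which is also why isValidPageContent now compares against a str prefix rather than b"Internal Server Error". A rough standalone sketch of that retry pattern, with stand-in constants (MAX_RETRIES, RETRY_PAUSE_SECONDS) and a plain session.get in place of the module's urlopen/check_robotstxt helpers:

import time
import requests

MAX_RETRIES = 3
RETRY_PAUSE_SECONDS = 5

def is_valid_page_content(data):
    # data is decoded text now, so the error prefix is compared as str.
    return bool(data) and not data.startswith("Internal Server Error")

def get_page_text(url, session):
    data = session.get(url).text
    tries = MAX_RETRIES
    while not is_valid_page_content(data) and tries > 0:
        time.sleep(RETRY_PAUSE_SECONDS)
        data = session.get(url).text
        tries -= 1
    if not is_valid_page_content(data):
        raise ValueError("Got invalid page content from %s: %r" % (url, data))
    return data

# Usage: get_page_text("https://example.com/", requests.Session())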