Change getPageContent to (optionally) return the raw, undecoded page content.

This allows LXML to do its own "magic" encoding detection.
This commit is contained in:
Tobias Gruetzmacher 2014-07-23 20:54:00 +02:00
parent 0e03eca8f0
commit fcde86e9c0

View file

@ -180,28 +180,28 @@ def isValidPageContent(data):
"""Check if page content is empty or has error messages.""" """Check if page content is empty or has error messages."""
# The python requests library sometimes returns empty data. # The python requests library sometimes returns empty data.
# Some webservers have a 200 OK status but have an error message as response. # Some webservers have a 200 OK status but have an error message as response.
return data and not data.startswith("Internal Server Error") return data and not data.startswith(b"Internal Server Error")
def getPageContent(url, session, max_content_bytes=MaxContentBytes): def getPageContent(url, session, max_content_bytes=MaxContentBytes, raw_data=False):
"""Get text content of given URL.""" """Get text content of given URL. If raw_data is True we try hard not to
decode the page content before returning (We work on page.content instead
of page.text most of the time)."""
check_robotstxt(url, session) check_robotstxt(url, session)
# read page data # read page data
try: try:
page = urlopen(url, session, max_content_bytes=max_content_bytes) page = urlopen(url, session, max_content_bytes=max_content_bytes)
except IOError: except IOError:
page = urlopen(url, session, max_content_bytes=max_content_bytes) page = urlopen(url, session, max_content_bytes=max_content_bytes)
data = page.text
tries = MaxRetries tries = MaxRetries
while not isValidPageContent(data) and tries > 0: while not isValidPageContent(page.content) and tries > 0:
time.sleep(RetryPauseSeconds) time.sleep(RetryPauseSeconds)
page = urlopen(url, session, max_content_bytes=max_content_bytes) page = urlopen(url, session, max_content_bytes=max_content_bytes)
data = page.text
tries -= 1 tries -= 1
if not isValidPageContent(data): if not isValidPageContent(page.content):
raise ValueError("Got invalid page content from %s: %r" % (url, data)) raise ValueError("Got invalid page content from %s: %r" % (url, page.text))
out.debug(u"Got page content %r" % data, level=3) if out.level >= 3: out.debug(u"Got page content %r" % page.text, level=3)
return data return page.content if raw_data else page.text
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes): def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):