Recognize internal server errors.
This commit is contained in:
parent
6ba226d2e2
commit
96bf9ef523
1 changed file with 16 additions and 7 deletions
|
@ -30,6 +30,9 @@ MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
||||||
# Default number of times a page fetch is retried before giving up
MaxRetries = 3

# Time to pause between retries, in seconds
RetryPauseSeconds = 5

# Default connection timeout, in seconds
ConnectionTimeoutSecs = 60
||||||
|
@ -95,21 +98,27 @@ def case_insensitive_re(name):
|
||||||
|
|
||||||
# Matches the href attribute of a <base> tag; used to determine a page's
# base URL when resolving relative links.
baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
|
def isValidPageContent(data):
    """Check if page content is empty or has error messages.

    Returns False when *data* is empty or missing (the python requests
    library sometimes returns empty data) and when the body starts with a
    known error message — some webservers answer 200 OK but put an error
    message in the response. Returns True otherwise.
    """
    # bool() ensures a real boolean is returned instead of leaking the
    # raw data value ('' / None) from the short-circuit expression.
    return bool(data) and not data.startswith("Internal Server Error")
|
|
||||||
|
|
||||||
def getPageContent(url, session, max_content_bytes=MaxContentBytes):
|
def getPageContent(url, session, max_content_bytes=MaxContentBytes):
|
||||||
"""Get text content of given URL."""
|
"""Get text content of given URL."""
|
||||||
check_robotstxt(url, session)
|
check_robotstxt(url, session)
|
||||||
# read page data
|
# read page data
|
||||||
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
||||||
data = page.text
|
data = page.text
|
||||||
tries = 0
|
tries = MaxRetries
|
||||||
while not data and tries < 5:
|
while not isValidPageContent(data) and tries > 0:
|
||||||
# sometimes the python requests library is wonky - try again
|
time.sleep(RetryPauseSeconds)
|
||||||
time.sleep(5)
|
|
||||||
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
||||||
data = page.text
|
data = page.text
|
||||||
tries += 1
|
tries -= 1
|
||||||
if not data:
|
if not isValidPageContent(data):
|
||||||
raise ValueError("Got empty data from %s" % url)
|
raise ValueError("Got invalid page content from %s: %r" % (url, data))
|
||||||
# determine base URL
|
# determine base URL
|
||||||
baseUrl = None
|
baseUrl = None
|
||||||
match = baseSearch.search(data)
|
match = baseSearch.search(data)
|
||||||
|
|
Loading…
Reference in a new issue