Recognize internal server errors.
This commit is contained in:
parent
6ba226d2e2
commit
96bf9ef523
1 changed file with 16 additions and 7 deletions
|
@ -30,6 +30,9 @@ MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
||||||
# Default number of times a page fetch is retried before giving up
MaxRetries = 3

# Time to pause between retries, in seconds
RetryPauseSeconds = 5

# Default connection timeout, in seconds
ConnectionTimeoutSecs = 60
||||||
|
@ -95,21 +98,27 @@ def case_insensitive_re(name):
|
||||||
|
|
||||||
# Matches the href attribute of a <base> tag; used to determine a page's
# base URL when resolving relative links.
baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
|
def isValidPageContent(data):
    """Check if page content is empty or has error messages.

    Returns False when *data* is empty or missing (the python requests
    library sometimes returns empty data) and when the body starts with a
    known error message — some webservers answer 200 OK but put an error
    message in the response. Returns True otherwise.
    """
    # bool() ensures a real boolean is returned instead of leaking the
    # raw data value ('' / None) from the short-circuit expression.
    return bool(data) and not data.startswith("Internal Server Error")
|
|
||||||
|
|
||||||
def getPageContent(url, session, max_content_bytes=MaxContentBytes):
|
def getPageContent(url, session, max_content_bytes=MaxContentBytes):
|
||||||
"""Get text content of given URL."""
|
"""Get text content of given URL."""
|
||||||
check_robotstxt(url, session)
|
check_robotstxt(url, session)
|
||||||
# read page data
|
# read page data
|
||||||
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
||||||
data = page.text
|
data = page.text
|
||||||
tries = 0
|
tries = MaxRetries
|
||||||
while not data and tries < 5:
|
while not isValidPageContent(data) and tries > 0:
|
||||||
# sometimes the python requests library is wonky - try again
|
time.sleep(RetryPauseSeconds)
|
||||||
time.sleep(5)
|
|
||||||
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
||||||
data = page.text
|
data = page.text
|
||||||
tries += 1
|
tries -= 1
|
||||||
if not data:
|
if not isValidPageContent(data):
|
||||||
raise ValueError("Got empty data from %s" % url)
|
raise ValueError("Got invalid page content from %s: %r" % (url, data))
|
||||||
# determine base URL
|
# determine base URL
|
||||||
baseUrl = None
|
baseUrl = None
|
||||||
match = baseSearch.search(data)
|
match = baseSearch.search(data)
|
||||||
|
|
Loading…
Reference in a new issue