From 96bf9ef52303fc278c47c7775f98f62d2079db2a Mon Sep 17 00:00:00 2001
From: Bastian Kleineidam <bastian.kleineidam@web.de>
Date: Wed, 13 Feb 2013 17:54:10 +0100
Subject: [PATCH] Recognize internal server errors.

---
 dosagelib/util.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/dosagelib/util.py b/dosagelib/util.py
index 573b0e6ae..deec5e8c9 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -30,6 +30,9 @@ MaxImageBytes = 1024 * 1024 * 20 # 20 MB
 # Default number of retries
 MaxRetries = 3
 
+# Time to pause between retries
+RetryPauseSeconds = 5
+
 # Default connection timeout
 ConnectionTimeoutSecs = 60
 
@@ -95,21 +98,27 @@ def case_insensitive_re(name):
 
 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
 
+def isValidPageContent(data):
+    """Check that page content is non-empty and is not an error message."""
+    # The Python requests library sometimes returns empty data.
+    # Some web servers send a 200 OK status with an error message as the body.
+    return data and not data.startswith("Internal Server Error")
+
+
 def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
     page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    tries = 0
-    while not data and tries < 5:
-        # sometimes the python requests library is wonky - try again
-        time.sleep(5)
+    tries = MaxRetries
+    while not isValidPageContent(data) and tries > 0:
+        time.sleep(RetryPauseSeconds)
         page = urlopen(url, session, max_content_bytes=max_content_bytes)
         data = page.text
-        tries += 1
-    if not data:
-        raise ValueError("Got empty data from %s" % url)
+        tries -= 1
+    if not isValidPageContent(data):
+        raise ValueError("Got invalid page content from %s: %r" % (url, data))
     # determine base URL
     baseUrl = None
     match = baseSearch.search(data)