Change getPageContent to (optionally) return raw (undecoded) content.

This allows lxml to do its own "magic" encoding detection.
2014-07-23 20:54:00 +02:00 · 2014-07-23 20:54:00 +02:00 · fcde86e9c0
commit fcde86e9c0
parent 0e03eca8f0
1 changed files with 10 additions and 10 deletions
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -180,28 +180,28 @@ def isValidPageContent(data):
    """Check if page content is empty or has error messages."""
    # The python requests library sometimes returns empty data.
    # Some webservers have a 200 OK status but have an error message as response.
-    return data and not data.startswith("Internal Server Error")
+    return data and not data.startswith(b"Internal Server Error")


-def getPageContent(url, session, max_content_bytes=MaxContentBytes):
-    """Get text content of given URL."""
+def getPageContent(url, session, max_content_bytes=MaxContentBytes, raw_data=False):
+    """Get text content of given URL. If raw_data is True we try hard not to
+    decode the page content before returning (we work on page.content instead
+    of page.text most of the time)."""
    check_robotstxt(url, session)
    # read page data
    try:
        page = urlopen(url, session, max_content_bytes=max_content_bytes)
    except IOError:
        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-    data = page.text
    tries = MaxRetries
-    while not isValidPageContent(data) and tries > 0:
+    while not isValidPageContent(page.content) and tries > 0:
        time.sleep(RetryPauseSeconds)
        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-        data = page.text
        tries -= 1
-    if not isValidPageContent(data):
-        raise ValueError("Got invalid page content from %s: %r" % (url, data))
-    out.debug(u"Got page content %r" % data, level=3)
-    return data
+    if not isValidPageContent(page.content):
+        raise ValueError("Got invalid page content from %s: %r" % (url, page.text))
+    if out.level >= 3: out.debug(u"Got page content %r" % page.text, level=3)
+    return page.content if raw_data else page.text


 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):