Pass unicode strings to lxml.

This reverts commit fcde86e9c0 and some
more. This lets python-requests handle all of the encoding and leaves
lxml with (hopefully) clean unicode HTML to parse.
Tobias Gruetzmacher 2014-10-13 19:39:48 +02:00
parent e87f5993b8
commit 3235b8b312
2 changed files with 11 additions and 11 deletions
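
For context, a minimal standalone sketch (not part of this commit; the URL
is hypothetical) of the division of labor the message describes: requests
decodes the response body to unicode using the charset from the HTTP headers
(falling back to a guess), and lxml then parses that unicode directly instead
of doing its own encoding detection on raw bytes.

    import requests
    from lxml import html

    resp = requests.get("https://example.com/")
    # resp.content is the raw byte payload; resp.text is unicode,
    # already decoded by requests.
    tree = html.document_fromstring(resp.text)  # lxml parses unicode as-is
    print(tree.findtext(".//title"))

One known caveat of this approach: lxml rejects unicode input that still
carries an explicit XML encoding declaration, hence, presumably, the
"(hopefully)" in the commit message.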


@@ -406,7 +406,7 @@ class _ParserScraper(Scraper):
         from lxml.html.defs import link_attrs
         cls.link_attrs = link_attrs
         cls.html = html
-        tree = html.document_fromstring(getPageContent(url, cls.session, raw_data=True))
+        tree = html.document_fromstring(getPageContent(url, cls.session))
         tree.make_links_absolute(url)
         return tree
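
As a small illustration of the parsing path above (markup and base URL made
up): document_fromstring accepts a unicode string directly, and
make_links_absolute then resolves relative links against the page URL.

    from lxml import html

    page = u'<html><body><a href="/comic/42">next</a></body></html>'
    tree = html.document_fromstring(page)  # unicode in, no decoding by lxml
    tree.make_links_absolute("https://example.com/")
    print(tree.xpath("//a/@href"))  # ['https://example.com/comic/42']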


@@ -180,28 +180,28 @@ def isValidPageContent(data):
     """Check if page content is empty or has error messages."""
     # The python requests library sometimes returns empty data.
     # Some webservers have a 200 OK status but have an error message as response.
-    return data and not data.startswith(b"Internal Server Error")
+    return data and not data.startswith("Internal Server Error")
 
 
-def getPageContent(url, session, max_content_bytes=MaxContentBytes, raw_data=False):
-    """Get text content of given URL. If raw_data is False we try hard not to
-    decode the page content before returning (We work on page.content instead
-    of page.text most of the time)."""
+def getPageContent(url, session, max_content_bytes=MaxContentBytes):
+    """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
     try:
         page = urlopen(url, session, max_content_bytes=max_content_bytes)
     except IOError:
         page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    data = page.text
     tries = MaxRetries
-    while not isValidPageContent(page.content) and tries > 0:
+    while not isValidPageContent(data) and tries > 0:
         time.sleep(RetryPauseSeconds)
         page = urlopen(url, session, max_content_bytes=max_content_bytes)
+        data = page.text
         tries -= 1
-    if not isValidPageContent(page.content):
-        raise ValueError("Got invalid page content from %s: %r" % (url, page.text))
-    if out.level >= 3: out.debug(u"Got page content %r" % page.text, level=3)
-    return page.content if raw_data else page.text
+    if not isValidPageContent(data):
+        raise ValueError("Got invalid page content from %s: %r" % (url, data))
+    out.debug(u"Got page content %r" % data, level=3)
+    return data
 
 
 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
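
One subtlety in the isValidPageContent() change above: data is now page.text,
a unicode string, so the error-message prefix has to be a str literal; on
Python 3 a bytes prefix would raise TypeError instead of simply not matching.
A tiny standalone illustration (not from the codebase):

    data = u"Internal Server Error"
    print(data.startswith("Internal Server Error"))  # True
    try:
        data.startswith(b"Internal Server Error")    # str vs. bytes prefix
    except TypeError as exc:
        print("Python 3 rejects a bytes prefix:", exc)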