Let urllib3 handle all retries.
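
The shared HTTP session used by the scrapers is now built by a new requests_session() helper in the util module: it mounts HTTPAdapter instances whose max_retries is a urllib3 Retry object (MaxRetries attempts, exponential backoff controlled by RetryBackoffFactor). The hand-rolled retry loop in getPageContent, the RetryPauseSeconds constant and the isValidPageContent() check are dropped, so failed requests are retried inside the transport layer, with backoff of roughly 2s, 4s and 8s, before an error ever reaches the caller.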
parent 78e13962f9
commit c4fcd985dd
2 changed files with 48 additions and 41 deletions
@@ -3,7 +3,6 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2014-2016 Tobias Gruetzmacher
 
-import requests
 import time
 import random
 import os
@@ -32,7 +31,7 @@ except ImportError:
 from . import loader, configuration, languages
 from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
                    getDirname, unescape, tagre, normaliseURL,
-                   prettyMatcherList)
+                   prettyMatcherList, requests_session)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -88,8 +87,8 @@ class Scraper(object):
     # usually the index format help
     help = ''
 
-    # HTTP session storing cookies
-    session = requests.session()
+    # HTTP session for configuration & cookies
+    session = requests_session()
 
     def __init__(self, indexes=None):
         """Initialize internal variables."""
@@ -1,6 +1,8 @@
-# -*- coding: iso-8859-1 -*-
+# -*- coding: utf-8 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2014-2016 Tobias Gruetzmacher
+
+from __future__ import division, print_function
 try:
     from urllib.parse import quote as url_quote, unquote as url_unquote
@@ -15,6 +17,11 @@ try:
 except ImportError:
     import robotparser
 import requests
+from requests.adapters import HTTPAdapter
+try:
+    from urllib3.util.retry import Retry
+except ImportError:
+    from requests.packages.urllib3.util.retry import Retry
 import sys
 import os
 import cgi
@@ -32,16 +39,17 @@ from .output import out
 from .configuration import UserAgent, AppName, App, SupportUrl
 
 # Maximum content size for HTML pages
-MaxContentBytes = 1024 * 1024 * 3 # 2 MB
+MaxContentBytes = 1024 * 1024 * 3  # 3 MB
 
 # Maximum content size for images
-MaxImageBytes = 1024 * 1024 * 20 # 20 MB
+MaxImageBytes = 1024 * 1024 * 20  # 20 MB
 
 # Default number of retries
 MaxRetries = 3
 
-# Time to pause between retries
-RetryPauseSeconds = 5
+# Factor for retry backoff (see urllib3.util.retry, this default means
+# 2s, 4s, 8s)
+RetryBackoffFactor = 2
 
 # Default connection timeout
 ConnectionTimeoutSecs = 60
@@ -55,6 +63,14 @@ ConnectionTimeoutSecs = 60
 UrlEncoding = "utf-8"
 
 
+def requests_session():
+    s = requests.Session()
+    retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
+    s.mount('http://', HTTPAdapter(max_retries=retry))
+    s.mount('https://', HTTPAdapter(max_retries=retry))
+    return s
+
+
 def get_system_uid():
     """Get a (probably) unique ID to identify a system.
     Used to differentiate votes.
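
The helper above is what the Scraper class now uses for its shared session. As a rough standalone sketch of the same idea (constant names taken from this diff; the example URL is made up), a retry-enabled session can be built and used like this, with urllib3 retrying failed connections and backing off exponentially before the caller ever sees an exception:

    import requests
    from requests.adapters import HTTPAdapter
    try:
        from urllib3.util.retry import Retry
    except ImportError:
        # older requests versions only ship the vendored copy of urllib3
        from requests.packages.urllib3.util.retry import Retry

    MaxRetries = 3
    RetryBackoffFactor = 2  # roughly 2s, 4s, 8s between attempts

    def requests_session():
        # Mount one adapter per URL scheme so both share the retry policy.
        s = requests.Session()
        retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
        s.mount('http://', HTTPAdapter(max_retries=retry))
        s.mount('https://', HTTPAdapter(max_retries=retry))
        return s

    session = requests_session()
    # A connection error here is retried inside urllib3; only after the
    # retries are exhausted does requests raise to the caller.
    r = session.get('http://example.com/', timeout=60)
    print(r.status_code)
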
@@ -107,7 +123,7 @@ def get_mac_uid():
     return "%d" % uuid.getnode()
 
 
-def backtick (cmd, encoding='utf-8'):
+def backtick(cmd, encoding='utf-8'):
     """Return decoded output from command."""
     data = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
     return data.decode(encoding)
@@ -155,7 +171,9 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
         prefix=prefix,
         after=after,
     )
-    return r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
+    return (r'<\s*%(tag)s\s+%(prefix)s' +
+            r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)' +
+            r's[^>]*%(after)s[^>]*>') % attrs
 
 
 def case_insensitive_re(name):
@@ -170,37 +188,20 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
 
 
-def isValidPageContent(data):
-    """Check if page content is empty or has error messages."""
-    # The python requests library sometimes returns empty data.
-    # Some webservers have a 200 OK status but have an error message as response.
-    return data and not data.startswith("Internal Server Error")
-
-
 def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
-    try:
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-    except IOError:
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    tries = MaxRetries
-    while not isValidPageContent(data) and tries > 0:
-        time.sleep(RetryPauseSeconds)
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-        data = page.text
-        tries -= 1
-    if not isValidPageContent(data):
-        raise ValueError("Got invalid page content from %s: %r" % (url, data))
     out.debug(u"Got page content %r" % data, level=3)
     return data
 
 
 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     """Get response object for given image URL."""
-    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes, stream=True)
+    return urlopen(url, session, referrer=referrer,
+                   max_content_bytes=max_content_bytes, stream=True)
 
 
 def makeSequence(item):
@@ -224,6 +225,8 @@ def prettyMatcherList(things):
 
+
 _htmlparser = HTMLParser()
 
+
 def unescape(text):
     """Replace HTML entities and character references."""
     return _htmlparser.unescape(text)
@@ -231,6 +234,7 @@ def unescape(text):
 
 _nopathquote_chars = "-;/=,~*+()@!"
 
+
 def normaliseURL(url):
     """Normalising
         - strips and leading or trailing whitespace,
@@ -275,7 +279,8 @@ def get_robotstxt_parser(url, session=None):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
     try:
-        req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
+        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
+                      raise_for_status=False)
     except Exception:
         # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
@@ -329,8 +334,9 @@ def check_content_size(url, headers, max_content_bytes):
     if 'content-length' in headers:
         size = int(headers['content-length'])
         if size > max_content_bytes:
-            msg = 'URL content of %s with %d bytes exceeds %d bytes.' % (url, size, max_content_bytes)
-            raise IOError(msg)
+            raise IOError(
+                'URL content of %s with %d bytes exceeds %d bytes.' %
+                (url, size, max_content_bytes))
 
 
 def splitpath(path):
@@ -388,7 +394,8 @@ I can work with ;) .
     print_proxy_info(out=out)
     print_locale_info(out=out)
     print(os.linesep,
-          "******** %s internal error, over and out ********" % AppName, file=out)
+          "******** %s internal error, over and out ********" % AppName,
+          file=out)
 
 
 def print_env_info(key, out=sys.stderr):
@@ -414,7 +421,7 @@ def print_app_info(out=sys.stderr):
     print("System info:", file=out)
     print(App, file=out)
     print("Python %(version)s on %(platform)s" %
-        {"version": sys.version, "platform": sys.platform}, file=out)
+          {"version": sys.version, "platform": sys.platform}, file=out)
     stime = strtime(time.time())
     print("Local time:", stime, file=out)
     print("sys.argv", sys.argv, file=out)
@@ -422,8 +429,8 @@ def print_app_info(out=sys.stderr):
 
 def strtime(t):
     """Return ISO 8601 formatted time."""
-    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
-        strtimezone()
+    return (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) +
+            strtimezone())
 
 
 def strtimezone():
@@ -461,7 +468,7 @@ def quote(text, safechars='/'):
     return url_quote(text, safechars)
 
 
-def strsize (b):
+def strsize(b):
     """Return human representation of bytes b. A negative number of bytes
     raises a value error."""
     if b < 0:
@@ -487,7 +494,8 @@ def getDirname(name):
 
 
 def getFilename(name):
-    """Get a filename from given name without dangerous or incompatible characters."""
+    """Get a filename from given name without dangerous or incompatible
+    characters."""
     # first replace all illegal chars
     name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
     # then remove double dots and underscores
@@ -532,7 +540,7 @@ def getNonexistingFile(name):
     return filename
 
 
-def strlimit (s, length=72):
+def strlimit(s, length=72):
     """If the length of the string exceeds the given limit, it will be cut
     off and three dots will be appended.
 