From c4fcd985dd1903796cbf9604f04d404443feda4a Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher
Date: Sun, 13 Mar 2016 21:27:31 +0100
Subject: [PATCH] Let urllib3 handle all retries.

---
 dosagelib/scraper.py |  7 ++--
 dosagelib/util.py    | 82 ++++++++++++++++++++++++--------------------
 2 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index f691b7709..171d13e62 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -3,7 +3,6 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2014-2016 Tobias Gruetzmacher
 
-import requests
 import time
 import random
 import os
@@ -32,7 +31,7 @@ except ImportError:
 from . import loader, configuration, languages
 from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
                    getDirname, unescape, tagre, normaliseURL,
-                   prettyMatcherList)
+                   prettyMatcherList, requests_session)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -88,8 +87,8 @@ class Scraper(object):
     # usually the index format help
     help = ''
 
-    # HTTP session storing cookies
-    session = requests.session()
+    # HTTP session for configuration & cookies
+    session = requests_session()
 
     def __init__(self, indexes=None):
         """Initialize internal variables."""
diff --git a/dosagelib/util.py b/dosagelib/util.py
index 48f8f303e..4f89bdea3 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -1,6 +1,8 @@
-# -*- coding: iso-8859-1 -*-
+# -*- coding: utf-8 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2014-2016 Tobias Gruetzmacher
+
 from __future__ import division, print_function
 try:
     from urllib.parse import quote as url_quote, unquote as url_unquote
@@ -15,6 +17,11 @@ try:
 except ImportError:
     import robotparser
 import requests
+from requests.adapters import HTTPAdapter
+try:
+    from urllib3.util.retry import Retry
+except ImportError:
+    from requests.packages.urllib3.util.retry import Retry
 import sys
 import os
 import cgi
@@ -32,16 +39,17 @@ from .output import out
 from .configuration import UserAgent, AppName, App, SupportUrl
 
 # Maximum content size for HTML pages
-MaxContentBytes = 1024 * 1024 * 3 # 2 MB
+MaxContentBytes = 1024 * 1024 * 3  # 3 MB
 
 # Maximum content size for images
-MaxImageBytes = 1024 * 1024 * 20 # 20 MB
+MaxImageBytes = 1024 * 1024 * 20  # 20 MB
 
 # Default number of retries
 MaxRetries = 3
 
-# Time to pause between retries
-RetryPauseSeconds = 5
+# Factor for retry backoff (see urllib3.util.retry, this default means
+# 2s, 4s, 8s)
+RetryBackoffFactor = 2
 
 # Default connection timeout
 ConnectionTimeoutSecs = 60
@@ -55,6 +63,14 @@ ConnectionTimeoutSecs = 60
 UrlEncoding = "utf-8"
 
 
+def requests_session():
+    s = requests.Session()
+    retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
+    s.mount('http://', HTTPAdapter(max_retries=retry))
+    s.mount('https://', HTTPAdapter(max_retries=retry))
+    return s
+
+
 def get_system_uid():
     """Get a (probably) unique ID to identify a system.
     Used to differentiate votes.
@@ -107,7 +123,7 @@ def get_mac_uid():
     return "%d" % uuid.getnode()
 
 
-def backtick (cmd, encoding='utf-8'):
+def backtick(cmd, encoding='utf-8'):
     """Return decoded output from command."""
     data = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
     return data.decode(encoding)
@@ -155,7 +171,9 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
         prefix=prefix,
         after=after,
     )
-    return r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
+    return (r'<\s*%(tag)s\s+%(prefix)s' +
+            r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)' +
+            r's[^>]*%(after)s[^>]*>') % attrs
 
 
 def case_insensitive_re(name):
@@ -170,37 +188,20 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
 
 
-def isValidPageContent(data):
-    """Check if page content is empty or has error messages."""
-    # The python requests library sometimes returns empty data.
-    # Some webservers have a 200 OK status but have an error message as response.
-    return data and not data.startswith("Internal Server Error")
-
-
 def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
-    try:
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-    except IOError:
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    tries = MaxRetries
-    while not isValidPageContent(data) and tries > 0:
-        time.sleep(RetryPauseSeconds)
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-        data = page.text
-        tries -= 1
-    if not isValidPageContent(data):
-        raise ValueError("Got invalid page content from %s: %r" % (url, data))
     out.debug(u"Got page content %r" % data, level=3)
     return data
 
 
 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     """Get response object for given image URL."""
-    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes, stream=True)
+    return urlopen(url, session, referrer=referrer,
+                   max_content_bytes=max_content_bytes, stream=True)
 
 
 def makeSequence(item):
@@ -224,6 +225,8 @@ def prettyMatcherList(things):
 
 
 _htmlparser = HTMLParser()
+
+
 def unescape(text):
     """Replace HTML entities and character references."""
     return _htmlparser.unescape(text)
@@ -231,6 +234,7 @@ def unescape(text):
 
 _nopathquote_chars = "-;/=,~*+()@!"
+
 
 def normaliseURL(url):
     """Normalising
     - strips and leading or trailing whitespace,
@@ -275,7 +279,8 @@ def get_robotstxt_parser(url, session=None):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
     try:
-        req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
+        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
+                      raise_for_status=False)
     except Exception:
         # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
@@ -329,8 +334,9 @@ def check_content_size(url, headers, max_content_bytes):
     if 'content-length' in headers:
         size = int(headers['content-length'])
         if size > max_content_bytes:
-            msg = 'URL content of %s with %d bytes exceeds %d bytes.' % (url, size, max_content_bytes)
-            raise IOError(msg)
+            raise IOError(
+                'URL content of %s with %d bytes exceeds %d bytes.' %
+                (url, size, max_content_bytes))
 
 
 def splitpath(path):
@@ -388,7 +394,8 @@ I can work with ;) .
     print_proxy_info(out=out)
     print_locale_info(out=out)
     print(os.linesep,
-          "******** %s internal error, over and out ********" % AppName, file=out)
+          "******** %s internal error, over and out ********" % AppName,
+          file=out)
 
 
 def print_env_info(key, out=sys.stderr):
@@ -414,7 +421,7 @@ def print_app_info(out=sys.stderr):
     print("System info:", file=out)
     print(App, file=out)
     print("Python %(version)s on %(platform)s" %
-        {"version": sys.version, "platform": sys.platform}, file=out)
+          {"version": sys.version, "platform": sys.platform}, file=out)
     stime = strtime(time.time())
     print("Local time:", stime, file=out)
     print("sys.argv", sys.argv, file=out)
@@ -422,8 +429,8 @@ def print_app_info(out=sys.stderr):
 
 def strtime(t):
     """Return ISO 8601 formatted time."""
-    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
-        strtimezone()
+    return (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) +
+            strtimezone())
 
 
 def strtimezone():
@@ -461,7 +468,7 @@ def quote(text, safechars='/'):
     return url_quote(text, safechars)
 
 
-def strsize (b):
+def strsize(b):
     """Return human representation of bytes b. A negative number of bytes
     raises a value error."""
     if b < 0:
@@ -487,7 +494,8 @@ def getDirname(name):
 
 
 def getFilename(name):
-    """Get a filename from given name without dangerous or incompatible characters."""
+    """Get a filename from given name without dangerous or incompatible
+    characters."""
     # first replace all illegal chars
     name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
     # then remove double dots and underscores
@@ -532,7 +540,7 @@ def getNonexistingFile(name):
     return filename
 
 
-def strlimit (s, length=72):
+def strlimit(s, length=72):
     """If the length of the string exceeds the given limit, it will be cut off
     and three dots will be appended.
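
Note (not part of the patch): the heart of this change is the new
requests_session() helper in dosagelib/util.py. Instead of dosage's old
hand-rolled loop (RetryPauseSeconds plus the isValidPageContent() checks in
getPageContent()), every Session now hands retrying to urllib3 by mounting
an HTTPAdapter configured with a Retry policy. A minimal self-contained
sketch of the same technique follows; the example URL is a placeholder, and
the constants simply mirror the patch:

    import requests
    from requests.adapters import HTTPAdapter
    try:
        from urllib3.util.retry import Retry
    except ImportError:
        # older requests versions bundle urllib3
        from requests.packages.urllib3.util.retry import Retry

    MaxRetries = 3
    RetryBackoffFactor = 2

    def requests_session():
        """Build a Session whose HTTP(S) adapters retry transient failures."""
        s = requests.Session()
        retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
        # Mount the adapter for both schemes so every request made through
        # this session is covered by the same retry policy.
        s.mount('http://', HTTPAdapter(max_retries=retry))
        s.mount('https://', HTTPAdapter(max_retries=retry))
        return s

    session = requests_session()
    # Connection errors and similar transient failures are now retried
    # inside urllib3; the caller only sees the final result or exception.
    response = session.get('http://example.com/')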
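
Note on the retry timing (per the comment introduced by the patch, which says
the default means 2s, 4s, 8s): urllib3 documents the sleep before the n-th
retry as backoff_factor * (2 ** (n - 1)), so a factor of 2 yields waits of 2,
4 and 8 seconds across the three retries allowed by MaxRetries. A quick check
of that arithmetic (exact behaviour can vary between urllib3 versions, e.g.
whether the very first retry sleeps at all):

    # Hypothetical helper, only to illustrate the documented formula.
    def backoff_delays(retries=3, backoff_factor=2):
        return [backoff_factor * 2 ** (n - 1) for n in range(1, retries + 1)]

    print(backoff_delays())  # [2, 4, 8]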