Let urllib3 handle all retries.

Tobias Gruetzmacher 2016-03-13 21:27:31 +01:00
parent 78e13962f9
commit c4fcd985dd
2 changed files with 48 additions and 41 deletions
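
The change in a nutshell: instead of retrying failed page fetches by hand (a while loop around urlopen() guarded by an isValidPageContent() body check), the shared requests session now mounts a urllib3 Retry policy, so retries with exponential backoff happen inside the HTTP stack. A minimal, self-contained sketch of the technique, consolidated from the new requests_session() helper in the diff below:

    import requests
    from requests.adapters import HTTPAdapter
    try:
        from urllib3.util.retry import Retry
    except ImportError:
        # older requests releases only ship a vendored copy of urllib3
        from requests.packages.urllib3.util.retry import Retry

    MaxRetries = 3          # total retry attempts
    RetryBackoffFactor = 2  # exponential backoff, roughly 2s, 4s, 8s

    def requests_session():
        """Build a Session whose transport layer retries failed requests."""
        s = requests.Session()
        retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
        # A mounted adapter handles every URL matching its prefix, so these
        # two cover all plain and TLS HTTP traffic made through the session.
        s.mount('http://', HTTPAdapter(max_retries=retry))
        s.mount('https://', HTTPAdapter(max_retries=retry))
        return s

Note that urllib3's Retry only reacts to transport-level failures (and, if configured, specific status codes); it never inspects response bodies, so the old isValidPageContent() check for error text in 200 responses is dropped outright rather than ported.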

--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py

@@ -3,7 +3,6 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2014-2016 Tobias Gruetzmacher
 
-import requests
 import time
 import random
 import os
@@ -32,7 +31,7 @@ except ImportError:
 from . import loader, configuration, languages
 from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
                    getDirname, unescape, tagre, normaliseURL,
-                   prettyMatcherList)
+                   prettyMatcherList, requests_session)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -88,8 +87,8 @@ class Scraper(object):
     # usually the index format help
     help = ''
 
-    # HTTP session storing cookies
-    session = requests.session()
+    # HTTP session for configuration & cookies
+    session = requests_session()
 
     def __init__(self, indexes=None):
         """Initialize internal variables."""

--- a/dosagelib/util.py
+++ b/dosagelib/util.py

@@ -1,6 +1,8 @@
-# -*- coding: iso-8859-1 -*-
+# -*- coding: utf-8 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2014-2016 Tobias Gruetzmacher
+
 from __future__ import division, print_function
 try:
     from urllib.parse import quote as url_quote, unquote as url_unquote
@@ -15,6 +17,11 @@ try:
 except ImportError:
     import robotparser
 import requests
+from requests.adapters import HTTPAdapter
+try:
+    from urllib3.util.retry import Retry
+except ImportError:
+    from requests.packages.urllib3.util.retry import Retry
 import sys
 import os
 import cgi
@@ -32,7 +39,7 @@ from .output import out
 from .configuration import UserAgent, AppName, App, SupportUrl
 
 # Maximum content size for HTML pages
-MaxContentBytes = 1024 * 1024 * 3  # 2 MB
+MaxContentBytes = 1024 * 1024 * 3  # 3 MB
 # Maximum content size for images
 MaxImageBytes = 1024 * 1024 * 20  # 20 MB
@@ -40,8 +47,9 @@ MaxImageBytes = 1024 * 1024 * 20  # 20 MB
 # Default number of retries
 MaxRetries = 3
-# Time to pause between retries
-RetryPauseSeconds = 5
+# Factor for retry backoff (see urllib3.util.retry, this default means
+# 2s, 4s, 8s)
+RetryBackoffFactor = 2
 
 # Default connection timeout
 ConnectionTimeoutSecs = 60
@@ -55,6 +63,14 @@ ConnectionTimeoutSecs = 60
 UrlEncoding = "utf-8"
 
 
+def requests_session():
+    s = requests.Session()
+    retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
+    s.mount('http://', HTTPAdapter(max_retries=retry))
+    s.mount('https://', HTTPAdapter(max_retries=retry))
+    return s
+
+
 def get_system_uid():
     """Get a (probably) unique ID to identify a system.
 
     Used to differentiate votes.
@@ -155,7 +171,9 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
         prefix=prefix,
         after=after,
     )
-    return r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
+    return (r'<\s*%(tag)s\s+%(prefix)s' +
+            r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)' +
+            r's[^>]*%(after)s[^>]*>') % attrs
 
 
 def case_insensitive_re(name):
@@ -170,37 +188,20 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
 
 
-def isValidPageContent(data):
-    """Check if page content is empty or has error messages."""
-    # The python requests library sometimes returns empty data.
-    # Some webservers have a 200 OK status but have an error message as response.
-    return data and not data.startswith("Internal Server Error")
-
-
 def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
-    try:
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-    except IOError:
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    tries = MaxRetries
-    while not isValidPageContent(data) and tries > 0:
-        time.sleep(RetryPauseSeconds)
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-        data = page.text
-        tries -= 1
-    if not isValidPageContent(data):
-        raise ValueError("Got invalid page content from %s: %r" % (url, data))
     out.debug(u"Got page content %r" % data, level=3)
     return data
 
 
 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     """Get response object for given image URL."""
-    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes, stream=True)
+    return urlopen(url, session, referrer=referrer,
+                   max_content_bytes=max_content_bytes, stream=True)
 
 
 def makeSequence(item):
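
From a caller's point of view, getPageContent() now issues a single request and either returns text or raises; a hypothetical use, with session built by the requests_session() helper above:

    session = requests_session()
    html = getPageContent('http://example.com/comic/', session)

Retries and backoff happen transparently inside the mounted adapter before urlopen() ever returns.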
@@ -224,6 +225,8 @@ def prettyMatcherList(things):
 _htmlparser = HTMLParser()
 
+
 def unescape(text):
     """Replace HTML entities and character references."""
     return _htmlparser.unescape(text)
@@ -231,6 +234,7 @@ def unescape(text):
 _nopathquote_chars = "-;/=,~*+()@!"
 
+
 def normaliseURL(url):
     """Normalising
     - strips and leading or trailing whitespace,
@@ -275,7 +279,8 @@ def get_robotstxt_parser(url, session=None):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
     try:
-        req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
+        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
+                      raise_for_status=False)
     except Exception:
         # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
@@ -329,8 +334,9 @@ def check_content_size(url, headers, max_content_bytes):
     if 'content-length' in headers:
         size = int(headers['content-length'])
         if size > max_content_bytes:
-            msg = 'URL content of %s with %d bytes exceeds %d bytes.' % (url, size, max_content_bytes)
-            raise IOError(msg)
+            raise IOError(
+                'URL content of %s with %d bytes exceeds %d bytes.' %
+                (url, size, max_content_bytes))
 
 
 def splitpath(path):
@@ -388,7 +394,8 @@ I can work with ;) .
     print_proxy_info(out=out)
     print_locale_info(out=out)
     print(os.linesep,
-          "******** %s internal error, over and out ********" % AppName, file=out)
+          "******** %s internal error, over and out ********" % AppName,
+          file=out)
 
 
 def print_env_info(key, out=sys.stderr):
@@ -422,8 +429,8 @@ def print_app_info(out=sys.stderr):
 def strtime(t):
     """Return ISO 8601 formatted time."""
-    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
-        strtimezone()
+    return (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) +
+            strtimezone())
 
 
 def strtimezone():
@@ -487,7 +494,8 @@ def getDirname(name):
 def getFilename(name):
-    """Get a filename from given name without dangerous or incompatible characters."""
+    """Get a filename from given name without dangerous or incompatible
+    characters."""
     # first replace all illegal chars
     name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
     # then remove double dots and underscores
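
For reference, urllib3 documents the pause before retry number n as backoff_factor * 2 ** (n - 1), capped at Retry.BACKOFF_MAX; that formula is where the "2s, 4s, 8s" in the new comment comes from. A quick sanity check, assuming the documented formula:

    RetryBackoffFactor = 2

    for attempt in range(1, 4):
        # urllib3's documented backoff formula (see urllib3.util.retry)
        print(RetryBackoffFactor * 2 ** (attempt - 1))  # prints 2, 4, 8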