From c4fcd985dd1903796cbf9604f04d404443feda4a Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher
Date: Sun, 13 Mar 2016 21:27:31 +0100
Subject: [PATCH] Let urllib3 handle all retries.

---
 dosagelib/scraper.py |  7 ++--
 dosagelib/util.py    | 82 ++++++++++++++++++++++++--------------------
 2 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index f691b7709..171d13e62 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -3,7 +3,6 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2014-2016 Tobias Gruetzmacher
 
-import requests
 import time
 import random
 import os
@@ -32,7 +31,7 @@ except ImportError:
 from . import loader, configuration, languages
 from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
                    getDirname, unescape, tagre, normaliseURL,
-                   prettyMatcherList)
+                   prettyMatcherList, requests_session)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -88,8 +87,8 @@ class Scraper(object):
     # usually the index format help
     help = ''
 
-    # HTTP session storing cookies
-    session = requests.session()
+    # HTTP session for configuration & cookies
+    session = requests_session()
 
     def __init__(self, indexes=None):
         """Initialize internal variables."""
diff --git a/dosagelib/util.py b/dosagelib/util.py
index 48f8f303e..4f89bdea3 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -1,6 +1,8 @@
-# -*- coding: iso-8859-1 -*-
+# -*- coding: utf-8 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2014-2016 Tobias Gruetzmacher
+
 from __future__ import division, print_function
 try:
     from urllib.parse import quote as url_quote, unquote as url_unquote
@@ -15,6 +17,11 @@ try:
 except ImportError:
     import robotparser
 import requests
+from requests.adapters import HTTPAdapter
+try:
+    from urllib3.util.retry import Retry
+except ImportError:
+    from requests.packages.urllib3.util.retry import Retry
 import sys
 import os
 import cgi
@@ -32,16 +39,17 @@ from .output import out
 from .configuration import UserAgent, AppName, App, SupportUrl
 
 # Maximum content size for HTML pages
-MaxContentBytes = 1024 * 1024 * 3 # 2 MB
+MaxContentBytes = 1024 * 1024 * 3  # 3 MB
 
 # Maximum content size for images
-MaxImageBytes = 1024 * 1024 * 20 # 20 MB
+MaxImageBytes = 1024 * 1024 * 20  # 20 MB
 
 # Default number of retries
 MaxRetries = 3
 
-# Time to pause between retries
-RetryPauseSeconds = 5
+# Factor for retry backoff (see urllib3.util.retry, this default means
+# 2s, 4s, 8s)
+RetryBackoffFactor = 2
 
 # Default connection timeout
 ConnectionTimeoutSecs = 60
@@ -55,6 +63,14 @@ ConnectionTimeoutSecs = 60
 UrlEncoding = "utf-8"
 
 
+def requests_session():
+    s = requests.Session()
+    retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
+    s.mount('http://', HTTPAdapter(max_retries=retry))
+    s.mount('https://', HTTPAdapter(max_retries=retry))
+    return s
+
+
 def get_system_uid():
     """Get a (probably) unique ID to identify a system.
     Used to differentiate votes.
@@ -107,7 +123,7 @@ def get_mac_uid():
     return "%d" % uuid.getnode()
 
 
-def backtick (cmd, encoding='utf-8'):
+def backtick(cmd, encoding='utf-8'):
     """Return decoded output from command."""
     data = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
     return data.decode(encoding)
@@ -155,7 +171,9 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
         prefix=prefix,
         after=after,
     )
-    return r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
+    return (r'<\s*%(tag)s\s+%(prefix)s' +
+            r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)' +
+            r's[^>]*%(after)s[^>]*>') % attrs
 
 
 def case_insensitive_re(name):
@@ -170,37 +188,20 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
 
 
-def isValidPageContent(data):
-    """Check if page content is empty or has error messages."""
-    # The python requests library sometimes returns empty data.
-    # Some webservers have a 200 OK status but have an error message as response.
-    return data and not data.startswith("Internal Server Error")
-
-
 def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
-    try:
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-    except IOError:
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    tries = MaxRetries
-    while not isValidPageContent(data) and tries > 0:
-        time.sleep(RetryPauseSeconds)
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-        data = page.text
-        tries -= 1
-    if not isValidPageContent(data):
-        raise ValueError("Got invalid page content from %s: %r" % (url, data))
     out.debug(u"Got page content %r" % data, level=3)
     return data
 
 
 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     """Get response object for given image URL."""
-    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes, stream=True)
+    return urlopen(url, session, referrer=referrer,
+                   max_content_bytes=max_content_bytes, stream=True)
 
 
 def makeSequence(item):
@@ -224,6 +225,8 @@ def prettyMatcherList(things):
 
 
 _htmlparser = HTMLParser()
+
+
 def unescape(text):
     """Replace HTML entities and character references."""
     return _htmlparser.unescape(text)
@@ -231,6 +234,7 @@ def unescape(text):
 
 _nopathquote_chars = "-;/=,~*+()@!"
+
 
 def normaliseURL(url):
     """Normalising
     - strips and leading or trailing whitespace,
@@ -275,7 +279,8 @@ def get_robotstxt_parser(url, session=None):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
     try:
-        req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
+        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
+                      raise_for_status=False)
     except Exception:
         # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
@@ -329,8 +334,9 @@ def check_content_size(url, headers, max_content_bytes):
     if 'content-length' in headers:
         size = int(headers['content-length'])
         if size > max_content_bytes:
-            msg = 'URL content of %s with %d bytes exceeds %d bytes.' % (url, size, max_content_bytes)
-            raise IOError(msg)
+            raise IOError(
+                'URL content of %s with %d bytes exceeds %d bytes.' %
+                (url, size, max_content_bytes))
 
 
 def splitpath(path):
@@ -388,7 +394,8 @@ I can work with ;) .
     print_proxy_info(out=out)
     print_locale_info(out=out)
     print(os.linesep,
-          "******** %s internal error, over and out ********" % AppName, file=out)
+          "******** %s internal error, over and out ********" % AppName,
+          file=out)
 
 
 def print_env_info(key, out=sys.stderr):
@@ -414,7 +421,7 @@ def print_app_info(out=sys.stderr):
     print("System info:", file=out)
     print(App, file=out)
     print("Python %(version)s on %(platform)s" %
-        {"version": sys.version, "platform": sys.platform}, file=out)
+          {"version": sys.version, "platform": sys.platform}, file=out)
     stime = strtime(time.time())
     print("Local time:", stime, file=out)
     print("sys.argv", sys.argv, file=out)
@@ -422,8 +429,8 @@ def print_app_info(out=sys.stderr):
 
 def strtime(t):
     """Return ISO 8601 formatted time."""
-    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
-        strtimezone()
+    return (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) +
+            strtimezone())
 
 
 def strtimezone():
@@ -461,7 +468,7 @@ def quote(text, safechars='/'):
     return url_quote(text, safechars)
 
 
-def strsize (b):
+def strsize(b):
     """Return human representation of bytes b. A negative number of bytes
     raises a value error."""
     if b < 0:
@@ -487,7 +494,8 @@ def getDirname(name):
 
 
 def getFilename(name):
-    """Get a filename from given name without dangerous or incompatible characters."""
+    """Get a filename from given name without dangerous or incompatible
+    characters."""
     # first replace all illegal chars
     name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
     # then remove double dots and underscores
@@ -532,7 +540,7 @@ def getNonexistingFile(name):
     return filename
 
 
-def strlimit (s, length=72):
+def strlimit(s, length=72):
     """If the length of the string exceeds the given limit, it will be cut off
     and three dots will be appended.
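
Note (not part of the patch): the heart of this change is the new
requests_session() helper in dosagelib/util.py. Instead of dosage's old
hand-rolled loop (RetryPauseSeconds plus the isValidPageContent() checks in
getPageContent()), every Session now hands retrying to urllib3 by mounting
an HTTPAdapter configured with a Retry policy. A minimal self-contained
sketch of the same technique follows; the example URL is a placeholder, and
the constants simply mirror the patch:

    import requests
    from requests.adapters import HTTPAdapter
    try:
        from urllib3.util.retry import Retry
    except ImportError:
        # older requests versions bundle urllib3
        from requests.packages.urllib3.util.retry import Retry

    MaxRetries = 3
    RetryBackoffFactor = 2

    def requests_session():
        """Build a Session whose HTTP(S) adapters retry transient failures."""
        s = requests.Session()
        retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
        # Mount the adapter for both schemes so every request made through
        # this session is covered by the same retry policy.
        s.mount('http://', HTTPAdapter(max_retries=retry))
        s.mount('https://', HTTPAdapter(max_retries=retry))
        return s

    session = requests_session()
    # Connection errors and similar transient failures are now retried
    # inside urllib3; the caller only sees the final result or exception.
    response = session.get('http://example.com/')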
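
Note on the retry timing (per the comment introduced by the patch, which says
the default means 2s, 4s, 8s): urllib3 documents the sleep before the n-th
retry as backoff_factor * (2 ** (n - 1)), so a factor of 2 yields waits of 2,
4 and 8 seconds across the three retries allowed by MaxRetries. A quick check
of that arithmetic (exact behaviour can vary between urllib3 versions, e.g.
whether the very first retry sleeps at all):

    # Hypothetical helper, only to illustrate the documented formula.
    def backoff_delays(retries=3, backoff_factor=2):
        return [backoff_factor * 2 ** (n - 1) for n in range(1, retries + 1)]

    print(backoff_delays())  # [2, 4, 8]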