Let urllib3 handle all retries.

Tobias Gruetzmacher 2016-03-13 21:27:31 +01:00
parent 78e13962f9
commit c4fcd985dd
2 changed files with 48 additions and 41 deletions

dosagelib/scraper.py

@@ -3,7 +3,6 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2014-2016 Tobias Gruetzmacher

-import requests
 import time
 import random
 import os
@@ -32,7 +31,7 @@ except ImportError:
 from . import loader, configuration, languages
 from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
                    getDirname, unescape, tagre, normaliseURL,
-                   prettyMatcherList)
+                   prettyMatcherList, requests_session)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -88,8 +87,8 @@ class Scraper(object):
     # usually the index format help
     help = ''

-    # HTTP session storing cookies
-    session = requests.session()
+    # HTTP session for configuration & cookies
+    session = requests_session()

     def __init__(self, indexes=None):
         """Initialize internal variables."""

dosagelib/util.py

@@ -1,6 +1,8 @@
-# -*- coding: iso-8859-1 -*-
+# -*- coding: utf-8 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2014-2016 Tobias Gruetzmacher
+
 from __future__ import division, print_function
 try:
     from urllib.parse import quote as url_quote, unquote as url_unquote
@@ -15,6 +17,11 @@ try:
 except ImportError:
     import robotparser
 import requests
+from requests.adapters import HTTPAdapter
+try:
+    from urllib3.util.retry import Retry
+except ImportError:
+    from requests.packages.urllib3.util.retry import Retry
 import sys
 import os
 import cgi
@@ -32,7 +39,7 @@ from .output import out
 from .configuration import UserAgent, AppName, App, SupportUrl


 # Maximum content size for HTML pages
-MaxContentBytes = 1024 * 1024 * 3  # 2 MB
+MaxContentBytes = 1024 * 1024 * 3  # 3 MB
 # Maximum content size for images
 MaxImageBytes = 1024 * 1024 * 20  # 20 MB
@@ -40,8 +47,9 @@ MaxImageBytes = 1024 * 1024 * 20  # 20 MB

 # Default number of retries
 MaxRetries = 3
-# Time to pause between retries
-RetryPauseSeconds = 5
+# Factor for retry backoff (see urllib3.util.retry, this default means
+# 2s, 4s, 8s)
+RetryBackoffFactor = 2

 # Default connection timeout
 ConnectionTimeoutSecs = 60
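To illustrate the backoff comment above: urllib3 sleeps between attempts based on the backoff factor, roughly factor * 2**(retry - 1) seconds (some urllib3 versions skip the sleep before the first retry, and very long waits are capped). A small sketch, not part of this commit, of the nominal schedule:

    # Illustrative only: approximate the waits urllib3 inserts between
    # retries for a given backoff factor (this helper is hypothetical).
    def backoff_schedule(retries=3, factor=2):
        return [factor * 2 ** (attempt - 1) for attempt in range(1, retries + 1)]

    print(backoff_schedule())  # [2, 4, 8] -- the "2s, 4s, 8s" from the comment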
@@ -55,6 +63,14 @@ ConnectionTimeoutSecs = 60
 UrlEncoding = "utf-8"


+def requests_session():
+    s = requests.Session()
+    retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
+    s.mount('http://', HTTPAdapter(max_retries=retry))
+    s.mount('https://', HTTPAdapter(max_retries=retry))
+    return s
+
+
 def get_system_uid():
     """Get a (probably) unique ID to identify a system.
     Used to differentiate votes.
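For context, a minimal standalone sketch of the pattern requests_session() sets up (the URL is a placeholder): mounting an HTTPAdapter with a Retry policy makes the session retry connection failures transparently, so callers no longer need their own loops.

    import requests
    from requests.adapters import HTTPAdapter
    try:
        from urllib3.util.retry import Retry
    except ImportError:
        from requests.packages.urllib3.util.retry import Retry

    session = requests.Session()
    adapter = HTTPAdapter(max_retries=Retry(total=3, backoff_factor=2))
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    # Connection/read errors are retried inside the adapter; the caller only
    # sees the final response or the final exception.
    resp = session.get('https://example.com/', timeout=60)
    resp.raise_for_status()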
@@ -107,7 +123,7 @@ def get_mac_uid():
     return "%d" % uuid.getnode()


-def backtick (cmd, encoding='utf-8'):
+def backtick(cmd, encoding='utf-8'):
     """Return decoded output from command."""
     data = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
     return data.decode(encoding)
@@ -155,7 +171,9 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
         prefix=prefix,
         after=after,
     )
-    return r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
+    return (r'<\s*%(tag)s\s+%(prefix)s' +
+            r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)' +
+            r's[^>]*%(after)s[^>]*>') % attrs


 def case_insensitive_re(name):
@@ -170,37 +188,20 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)


-def isValidPageContent(data):
-    """Check if page content is empty or has error messages."""
-    # The python requests library sometimes returns empty data.
-    # Some webservers have a 200 OK status but have an error message as response.
-    return data and not data.startswith("Internal Server Error")
-
-
 def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
-    try:
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-    except IOError:
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    tries = MaxRetries
-    while not isValidPageContent(data) and tries > 0:
-        time.sleep(RetryPauseSeconds)
-        page = urlopen(url, session, max_content_bytes=max_content_bytes)
-        data = page.text
-        tries -= 1
-    if not isValidPageContent(data):
-        raise ValueError("Got invalid page content from %s: %r" % (url, data))
     out.debug(u"Got page content %r" % data, level=3)
     return data


 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     """Get response object for given image URL."""
-    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes, stream=True)
+    return urlopen(url, session, referrer=referrer,
+                   max_content_bytes=max_content_bytes, stream=True)


 def makeSequence(item):
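With the hand-rolled loop above removed, retry exhaustion no longer surfaces as the old ValueError; it typically propagates from the session as a requests exception. A hedged sketch of what a caller might now catch (fetch_text and its error handling are illustrative, not part of this commit):

    import requests

    def fetch_text(session, url):
        try:
            resp = session.get(url, timeout=60)
            resp.raise_for_status()
            return resp.text
        except requests.exceptions.ConnectionError as exc:
            # When the adapter's retries are used up, urllib3's MaxRetryError
            # usually arrives wrapped in a ConnectionError like this one.
            raise IOError("giving up on %s after retries: %s" % (url, exc))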
@@ -224,6 +225,8 @@ def prettyMatcherList(things):

 _htmlparser = HTMLParser()
+
+
 def unescape(text):
     """Replace HTML entities and character references."""
     return _htmlparser.unescape(text)
@@ -231,6 +234,7 @@ def unescape(text):

 _nopathquote_chars = "-;/=,~*+()@!"

+
 def normaliseURL(url):
     """Normalising
     - strips and leading or trailing whitespace,
@@ -275,7 +279,8 @@ def get_robotstxt_parser(url, session=None):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
     try:
-        req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
+        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
+                      raise_for_status=False)
     except Exception:
         # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
@@ -329,8 +334,9 @@ def check_content_size(url, headers, max_content_bytes):
     if 'content-length' in headers:
         size = int(headers['content-length'])
         if size > max_content_bytes:
-            msg = 'URL content of %s with %d bytes exceeds %d bytes.' % (url, size, max_content_bytes)
-            raise IOError(msg)
+            raise IOError(
+                'URL content of %s with %d bytes exceeds %d bytes.' %
+                (url, size, max_content_bytes))


 def splitpath(path):
@@ -388,7 +394,8 @@ I can work with ;) .
     print_proxy_info(out=out)
     print_locale_info(out=out)
     print(os.linesep,
-          "******** %s internal error, over and out ********" % AppName, file=out)
+          "******** %s internal error, over and out ********" % AppName,
+          file=out)


 def print_env_info(key, out=sys.stderr):
@@ -422,8 +429,8 @@ def print_app_info(out=sys.stderr):

 def strtime(t):
     """Return ISO 8601 formatted time."""
-    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
-        strtimezone()
+    return (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) +
+            strtimezone())


 def strtimezone():
@@ -461,7 +468,7 @@ def quote(text, safechars='/'):
     return url_quote(text, safechars)


-def strsize (b):
+def strsize(b):
     """Return human representation of bytes b. A negative number of bytes
     raises a value error."""
     if b < 0:
@@ -487,7 +494,8 @@ def getDirname(name):


 def getFilename(name):
-    """Get a filename from given name without dangerous or incompatible characters."""
+    """Get a filename from given name without dangerous or incompatible
+    characters."""
     # first replace all illegal chars
     name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
     # then remove double dots and underscores
@@ -532,7 +540,7 @@ def getNonexistingFile(name):
     return filename


-def strlimit (s, length=72):
+def strlimit(s, length=72):
     """If the length of the string exceeds the given limit, it will be cut
     off and three dots will be appended.