Let urllib3 handle all retries.

This commit is contained in:
Tobias Gruetzmacher 2016-03-13 21:27:31 +01:00
parent 78e13962f9
commit c4fcd985dd
2 changed files with 48 additions and 41 deletions

View file

@ -3,7 +3,6 @@
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2014-2016 Tobias Gruetzmacher
import requests
import time
import random
import os
@ -32,7 +31,7 @@ except ImportError:
from . import loader, configuration, languages
from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
getDirname, unescape, tagre, normaliseURL,
prettyMatcherList)
prettyMatcherList, requests_session)
from .comic import ComicStrip
from .output import out
from .events import getHandler
@ -88,8 +87,8 @@ class Scraper(object):
# usually the index format help
help = ''
# HTTP session storing cookies
session = requests.session()
# HTTP session for configuration & cookies
session = requests_session()
def __init__(self, indexes=None):
"""Initialize internal variables."""

View file

@ -1,6 +1,8 @@
# -*- coding: iso-8859-1 -*-
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2014-2016 Tobias Gruetzmacher
from __future__ import division, print_function
try:
from urllib.parse import quote as url_quote, unquote as url_unquote
@ -15,6 +17,11 @@ try:
except ImportError:
import robotparser
import requests
from requests.adapters import HTTPAdapter
try:
from urllib3.util.retry import Retry
except ImportError:
from requests.packages.urllib3.util.retry import Retry
import sys
import os
import cgi
@ -32,7 +39,7 @@ from .output import out
from .configuration import UserAgent, AppName, App, SupportUrl
# Maximum content size for HTML pages
MaxContentBytes = 1024 * 1024 * 3 # 2 MB
MaxContentBytes = 1024 * 1024 * 3 # 3 MB
# Maximum content size for images
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
@ -40,8 +47,9 @@ MaxImageBytes = 1024 * 1024 * 20 # 20 MB
# Default number of retries
MaxRetries = 3
# Time to pause between retries
RetryPauseSeconds = 5
# Factor for retry backoff (see urllib3.util.retry, this default means
# 2s, 4s, 8s)
RetryBackoffFactor = 2
# Default connection timeout
ConnectionTimeoutSecs = 60
@ -55,6 +63,14 @@ ConnectionTimeoutSecs = 60
UrlEncoding = "utf-8"
def requests_session():
s = requests.Session()
retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
s.mount('http://', HTTPAdapter(max_retries=retry))
s.mount('https://', HTTPAdapter(max_retries=retry))
return s
def get_system_uid():
"""Get a (probably) unique ID to identify a system.
Used to differentiate votes.
@ -155,7 +171,9 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
prefix=prefix,
after=after,
)
return r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
return (r'<\s*%(tag)s\s+%(prefix)s' +
r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)' +
r's[^>]*%(after)s[^>]*>') % attrs
def case_insensitive_re(name):
@ -170,37 +188,20 @@ def case_insensitive_re(name):
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
def isValidPageContent(data):
"""Check if page content is empty or has error messages."""
# The python requests library sometimes returns empty data.
# Some webservers have a 200 OK status but have an error message as response.
return data and not data.startswith("Internal Server Error")
def getPageContent(url, session, max_content_bytes=MaxContentBytes):
"""Get text content of given URL."""
check_robotstxt(url, session)
# read page data
try:
page = urlopen(url, session, max_content_bytes=max_content_bytes)
except IOError:
page = urlopen(url, session, max_content_bytes=max_content_bytes)
data = page.text
tries = MaxRetries
while not isValidPageContent(data) and tries > 0:
time.sleep(RetryPauseSeconds)
page = urlopen(url, session, max_content_bytes=max_content_bytes)
data = page.text
tries -= 1
if not isValidPageContent(data):
raise ValueError("Got invalid page content from %s: %r" % (url, data))
out.debug(u"Got page content %r" % data, level=3)
return data
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
"""Get response object for given image URL."""
return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes, stream=True)
return urlopen(url, session, referrer=referrer,
max_content_bytes=max_content_bytes, stream=True)
def makeSequence(item):
@ -224,6 +225,8 @@ def prettyMatcherList(things):
_htmlparser = HTMLParser()
def unescape(text):
"""Replace HTML entities and character references."""
return _htmlparser.unescape(text)
@ -231,6 +234,7 @@ def unescape(text):
_nopathquote_chars = "-;/=,~*+()@!"
def normaliseURL(url):
"""Normalising
- strips and leading or trailing whitespace,
@ -275,7 +279,8 @@ def get_robotstxt_parser(url, session=None):
"""Get a RobotFileParser for the given robots.txt URL."""
rp = robotparser.RobotFileParser()
try:
req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
req = urlopen(url, session, max_content_bytes=MaxContentBytes,
raise_for_status=False)
except Exception:
# connect or timeout errors are treated as an absent robots.txt
rp.allow_all = True
@ -329,8 +334,9 @@ def check_content_size(url, headers, max_content_bytes):
if 'content-length' in headers:
size = int(headers['content-length'])
if size > max_content_bytes:
msg = 'URL content of %s with %d bytes exceeds %d bytes.' % (url, size, max_content_bytes)
raise IOError(msg)
raise IOError(
'URL content of %s with %d bytes exceeds %d bytes.' %
(url, size, max_content_bytes))
def splitpath(path):
@ -388,7 +394,8 @@ I can work with ;) .
print_proxy_info(out=out)
print_locale_info(out=out)
print(os.linesep,
"******** %s internal error, over and out ********" % AppName, file=out)
"******** %s internal error, over and out ********" % AppName,
file=out)
def print_env_info(key, out=sys.stderr):
@ -422,8 +429,8 @@ def print_app_info(out=sys.stderr):
def strtime(t):
"""Return ISO 8601 formatted time."""
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
strtimezone()
return (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) +
strtimezone())
def strtimezone():
@ -487,7 +494,8 @@ def getDirname(name):
def getFilename(name):
"""Get a filename from given name without dangerous or incompatible characters."""
"""Get a filename from given name without dangerous or incompatible
characters."""
# first replace all illegal chars
name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
# then remove double dots and underscores