Let urllib3 handle all retries.
This commit is contained in:
parent
78e13962f9
commit
c4fcd985dd
2 changed files with 48 additions and 41 deletions
|
@ -3,7 +3,6 @@
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2014-2016 Tobias Gruetzmacher
|
# Copyright (C) 2014-2016 Tobias Gruetzmacher
|
||||||
|
|
||||||
import requests
|
|
||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
import os
|
import os
|
||||||
|
@ -32,7 +31,7 @@ except ImportError:
|
||||||
from . import loader, configuration, languages
|
from . import loader, configuration, languages
|
||||||
from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
|
from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
|
||||||
getDirname, unescape, tagre, normaliseURL,
|
getDirname, unescape, tagre, normaliseURL,
|
||||||
prettyMatcherList)
|
prettyMatcherList, requests_session)
|
||||||
from .comic import ComicStrip
|
from .comic import ComicStrip
|
||||||
from .output import out
|
from .output import out
|
||||||
from .events import getHandler
|
from .events import getHandler
|
||||||
|
@ -88,8 +87,8 @@ class Scraper(object):
|
||||||
# usually the index format help
|
# usually the index format help
|
||||||
help = ''
|
help = ''
|
||||||
|
|
||||||
# HTTP session storing cookies
|
# HTTP session for configuration & cookies
|
||||||
session = requests.session()
|
session = requests_session()
|
||||||
|
|
||||||
def __init__(self, indexes=None):
|
def __init__(self, indexes=None):
|
||||||
"""Initialize internal variables."""
|
"""Initialize internal variables."""
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
# -*- coding: iso-8859-1 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
|
# Copyright (C) 2014-2016 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import division, print_function
|
from __future__ import division, print_function
|
||||||
try:
|
try:
|
||||||
from urllib.parse import quote as url_quote, unquote as url_unquote
|
from urllib.parse import quote as url_quote, unquote as url_unquote
|
||||||
|
@ -15,6 +17,11 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import robotparser
|
import robotparser
|
||||||
import requests
|
import requests
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
try:
|
||||||
|
from urllib3.util.retry import Retry
|
||||||
|
except ImportError:
|
||||||
|
from requests.packages.urllib3.util.retry import Retry
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import cgi
|
import cgi
|
||||||
|
@ -32,7 +39,7 @@ from .output import out
|
||||||
from .configuration import UserAgent, AppName, App, SupportUrl
|
from .configuration import UserAgent, AppName, App, SupportUrl
|
||||||
|
|
||||||
# Maximum content size for HTML pages
|
# Maximum content size for HTML pages
|
||||||
MaxContentBytes = 1024 * 1024 * 3 # 2 MB
|
MaxContentBytes = 1024 * 1024 * 3 # 3 MB
|
||||||
|
|
||||||
# Maximum content size for images
|
# Maximum content size for images
|
||||||
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
||||||
|
@ -40,8 +47,9 @@ MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
||||||
# Default number of retries
|
# Default number of retries
|
||||||
MaxRetries = 3
|
MaxRetries = 3
|
||||||
|
|
||||||
# Time to pause between retries
|
# Factor for retry backoff (see urllib3.util.retry, this default means
|
||||||
RetryPauseSeconds = 5
|
# 0s, 4s, 8s between attempts; urllib3 does not sleep before the first retry)
|
||||||
|
RetryBackoffFactor = 2
|
||||||
|
|
||||||
# Default connection timeout
|
# Default connection timeout
|
||||||
ConnectionTimeoutSecs = 60
|
ConnectionTimeoutSecs = 60
|
||||||
|
@ -55,6 +63,14 @@ ConnectionTimeoutSecs = 60
|
||||||
UrlEncoding = "utf-8"
|
UrlEncoding = "utf-8"
|
||||||
|
|
||||||
|
|
||||||
|
def requests_session():
|
||||||
|
s = requests.Session()
|
||||||
|
retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
|
||||||
|
s.mount('http://', HTTPAdapter(max_retries=retry))
|
||||||
|
s.mount('https://', HTTPAdapter(max_retries=retry))
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
def get_system_uid():
|
def get_system_uid():
|
||||||
"""Get a (probably) unique ID to identify a system.
|
"""Get a (probably) unique ID to identify a system.
|
||||||
Used to differentiate votes.
|
Used to differentiate votes.
|
||||||
|
@ -155,7 +171,9 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
|
||||||
prefix=prefix,
|
prefix=prefix,
|
||||||
after=after,
|
after=after,
|
||||||
)
|
)
|
||||||
return r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
|
return (r'<\s*%(tag)s\s+%(prefix)s' +
|
||||||
|
r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)' +
|
||||||
|
r's[^>]*%(after)s[^>]*>') % attrs
|
||||||
|
|
||||||
|
|
||||||
def case_insensitive_re(name):
|
def case_insensitive_re(name):
|
||||||
|
@ -170,37 +188,20 @@ def case_insensitive_re(name):
|
||||||
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
|
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
|
||||||
|
|
||||||
|
|
||||||
def isValidPageContent(data):
|
|
||||||
"""Check if page content is empty or has error messages."""
|
|
||||||
# The python requests library sometimes returns empty data.
|
|
||||||
# Some webservers have a 200 OK status but have an error message as response.
|
|
||||||
return data and not data.startswith("Internal Server Error")
|
|
||||||
|
|
||||||
|
|
||||||
def getPageContent(url, session, max_content_bytes=MaxContentBytes):
|
def getPageContent(url, session, max_content_bytes=MaxContentBytes):
|
||||||
"""Get text content of given URL."""
|
"""Get text content of given URL."""
|
||||||
check_robotstxt(url, session)
|
check_robotstxt(url, session)
|
||||||
# read page data
|
# read page data
|
||||||
try:
|
|
||||||
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
|
||||||
except IOError:
|
|
||||||
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
||||||
data = page.text
|
data = page.text
|
||||||
tries = MaxRetries
|
|
||||||
while not isValidPageContent(data) and tries > 0:
|
|
||||||
time.sleep(RetryPauseSeconds)
|
|
||||||
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
|
||||||
data = page.text
|
|
||||||
tries -= 1
|
|
||||||
if not isValidPageContent(data):
|
|
||||||
raise ValueError("Got invalid page content from %s: %r" % (url, data))
|
|
||||||
out.debug(u"Got page content %r" % data, level=3)
|
out.debug(u"Got page content %r" % data, level=3)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
|
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
|
||||||
"""Get response object for given image URL."""
|
"""Get response object for given image URL."""
|
||||||
return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes, stream=True)
|
return urlopen(url, session, referrer=referrer,
|
||||||
|
max_content_bytes=max_content_bytes, stream=True)
|
||||||
|
|
||||||
|
|
||||||
def makeSequence(item):
|
def makeSequence(item):
|
||||||
|
@ -224,6 +225,8 @@ def prettyMatcherList(things):
|
||||||
|
|
||||||
|
|
||||||
_htmlparser = HTMLParser()
|
_htmlparser = HTMLParser()
|
||||||
|
|
||||||
|
|
||||||
def unescape(text):
|
def unescape(text):
|
||||||
"""Replace HTML entities and character references."""
|
"""Replace HTML entities and character references."""
|
||||||
return _htmlparser.unescape(text)
|
return _htmlparser.unescape(text)
|
||||||
|
@ -231,6 +234,7 @@ def unescape(text):
|
||||||
|
|
||||||
_nopathquote_chars = "-;/=,~*+()@!"
|
_nopathquote_chars = "-;/=,~*+()@!"
|
||||||
|
|
||||||
|
|
||||||
def normaliseURL(url):
|
def normaliseURL(url):
|
||||||
"""Normalising
|
"""Normalising
|
||||||
- strips any leading or trailing whitespace,
|
- strips any leading or trailing whitespace,
|
||||||
|
@ -275,7 +279,8 @@ def get_robotstxt_parser(url, session=None):
|
||||||
"""Get a RobotFileParser for the given robots.txt URL."""
|
"""Get a RobotFileParser for the given robots.txt URL."""
|
||||||
rp = robotparser.RobotFileParser()
|
rp = robotparser.RobotFileParser()
|
||||||
try:
|
try:
|
||||||
req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
|
req = urlopen(url, session, max_content_bytes=MaxContentBytes,
|
||||||
|
raise_for_status=False)
|
||||||
except Exception:
|
except Exception:
|
||||||
# connect or timeout errors are treated as an absent robots.txt
|
# connect or timeout errors are treated as an absent robots.txt
|
||||||
rp.allow_all = True
|
rp.allow_all = True
|
||||||
|
@ -329,8 +334,9 @@ def check_content_size(url, headers, max_content_bytes):
|
||||||
if 'content-length' in headers:
|
if 'content-length' in headers:
|
||||||
size = int(headers['content-length'])
|
size = int(headers['content-length'])
|
||||||
if size > max_content_bytes:
|
if size > max_content_bytes:
|
||||||
msg = 'URL content of %s with %d bytes exceeds %d bytes.' % (url, size, max_content_bytes)
|
raise IOError(
|
||||||
raise IOError(msg)
|
'URL content of %s with %d bytes exceeds %d bytes.' %
|
||||||
|
(url, size, max_content_bytes))
|
||||||
|
|
||||||
|
|
||||||
def splitpath(path):
|
def splitpath(path):
|
||||||
|
@ -388,7 +394,8 @@ I can work with ;) .
|
||||||
print_proxy_info(out=out)
|
print_proxy_info(out=out)
|
||||||
print_locale_info(out=out)
|
print_locale_info(out=out)
|
||||||
print(os.linesep,
|
print(os.linesep,
|
||||||
"******** %s internal error, over and out ********" % AppName, file=out)
|
"******** %s internal error, over and out ********" % AppName,
|
||||||
|
file=out)
|
||||||
|
|
||||||
|
|
||||||
def print_env_info(key, out=sys.stderr):
|
def print_env_info(key, out=sys.stderr):
|
||||||
|
@ -422,8 +429,8 @@ def print_app_info(out=sys.stderr):
|
||||||
|
|
||||||
def strtime(t):
|
def strtime(t):
|
||||||
"""Return ISO 8601 formatted time."""
|
"""Return ISO 8601 formatted time."""
|
||||||
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
|
return (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) +
|
||||||
strtimezone()
|
strtimezone())
|
||||||
|
|
||||||
|
|
||||||
def strtimezone():
|
def strtimezone():
|
||||||
|
@ -487,7 +494,8 @@ def getDirname(name):
|
||||||
|
|
||||||
|
|
||||||
def getFilename(name):
|
def getFilename(name):
|
||||||
"""Get a filename from given name without dangerous or incompatible characters."""
|
"""Get a filename from given name without dangerous or incompatible
|
||||||
|
characters."""
|
||||||
# first replace all illegal chars
|
# first replace all illegal chars
|
||||||
name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
|
name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
|
||||||
# then remove double dots and underscores
|
# then remove double dots and underscores
|
||||||
|
|
Loading…
Reference in a new issue