Add simple host-based throttling
This commit is contained in:
parent
e5e7dfacd6
commit
a347bebfe3
2 changed files with 36 additions and 5 deletions
|
@ -3,9 +3,14 @@
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from random import uniform
|
||||||
|
from time import time, sleep
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
from requests.packages.urllib3.util.retry import Retry
|
from requests.packages.urllib3.util.retry import Retry
|
||||||
|
from six.moves.urllib.parse import urlparse
|
||||||
|
|
||||||
from .configuration import UserAgent
|
from .configuration import UserAgent
|
||||||
|
|
||||||
|
@ -21,6 +26,13 @@ ConnectionTimeoutSecs = 60
|
||||||
|
|
||||||
|
|
||||||
class Session(requests.Session):
|
class Session(requests.Session):
|
||||||
|
"""This session implements a very simple host-based throttling system: For
|
||||||
|
each hostname we see, we keep a record of the earliest time at which we want
|
||||||
|
to send the next request: If before sending a request this time isn't
|
||||||
|
reached, we sleep a bit until the requirements are satisfied. By default,
|
||||||
|
we only delay a random amount of at most 0.3sec - but some hosts might need
|
||||||
|
longer delays.
|
||||||
|
"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super(Session, self).__init__()
|
super(Session, self).__init__()
|
||||||
|
|
||||||
|
@ -29,11 +41,35 @@ class Session(requests.Session):
|
||||||
self.mount('https://', HTTPAdapter(max_retries=retry))
|
self.mount('https://', HTTPAdapter(max_retries=retry))
|
||||||
self.headers.update({'User-Agent': UserAgent})
|
self.headers.update({'User-Agent': UserAgent})
|
||||||
|
|
||||||
|
self.throttles = defaultdict(lambda: RandomThrottle(0.0, 0.3))
|
||||||
|
|
||||||
def send(self, request, **kwargs):
    """Send a prepared request after honouring the target host's throttle.

    ``ConnectionTimeoutSecs`` is used as the timeout unless the caller
    supplied a ``timeout`` keyword argument. Before the request is handed
    to the parent class, the throttle registered for the request's
    hostname is asked to delay.
    """
    kwargs.setdefault('timeout', ConnectionTimeoutSecs)

    # Throttles are keyed by hostname only; scheme, port and path are ignored.
    throttle = self.throttles[urlparse(request.url).hostname]
    throttle.delay()

    return super(Session, self).send(request, **kwargs)
|
||||||
|
|
||||||
|
def add_throttle(self, hostname, th_min, th_max):
    """Register a throttle for ``hostname``, replacing any existing one.

    ``th_min`` and ``th_max`` are passed unchanged to ``RandomThrottle``
    as the bounds, in seconds, of the pause between requests.
    """
    throttle = RandomThrottle(th_min, th_max)
    self.throttles[hostname] = throttle
|
||||||
|
|
||||||
|
|
||||||
|
class RandomThrottle(object):
    """Enforce a randomised pause between successive calls to ``delay``.

    ``th_min`` and ``th_max`` are the bounds, in seconds, handed to
    ``random.uniform`` when the next pause is chosen. ``next`` is the
    earliest wall-clock time (as returned by ``time()``) at which the
    following call to ``delay`` may return without sleeping.
    """

    def __init__(self, th_min, th_max):
        """Create a throttle pausing between ``th_min`` and ``th_max`` seconds."""
        self.th_min = th_min
        self.th_max = th_max
        # Start at "now" so the very first call to delay() does not sleep.
        self.next = time()

    def delay(self):
        """Sleep until the scheduled time is reached, then reschedule.

        After any sleep, ``next`` is set to the current time plus a random
        value drawn with ``uniform(th_min, th_max)``.
        """
        remaining = self.next - time()
        if remaining > 0:
            # time() is the adjustable wall clock: if it was stepped
            # backwards, `remaining` can be far larger than anything this
            # throttle scheduled. Never sleep longer than the largest pause
            # it could have asked for (and never pass sleep() a negative).
            longest = max(self.th_min, self.th_max, 0.0)
            sleep(min(remaining, longest))
        self.next = time() + uniform(self.th_min, self.th_max)
|
||||||
|
|
||||||
|
|
||||||
# A default session for cookie and connection sharing
|
# A default session for cookie and connection sharing
|
||||||
default_session = Session()
|
default_session = Session()
|
||||||
|
|
|
@ -5,8 +5,6 @@
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from six.moves.urllib.parse import urljoin
|
from six.moves.urllib.parse import urljoin
|
||||||
|
@ -200,9 +198,6 @@ class Scraper(object):
|
||||||
out.warn(u"Already seen previous URL %r" % prevUrl)
|
out.warn(u"Already seen previous URL %r" % prevUrl)
|
||||||
break
|
break
|
||||||
url = prevUrl
|
url = prevUrl
|
||||||
if url:
|
|
||||||
# wait up to 2 seconds for next URL
|
|
||||||
time.sleep(1.0 + random.random())
|
|
||||||
|
|
||||||
def getPrevUrl(self, url, data):
|
def getPrevUrl(self, url, data):
|
||||||
"""Find previous URL."""
|
"""Find previous URL."""
|
||||||
|
|
Loading…
Reference in a new issue