Add simple host-based throttling

This commit is contained in:
Tobias Gruetzmacher 2019-12-03 22:21:28 +01:00
parent e5e7dfacd6
commit a347bebfe3
2 changed files with 36 additions and 5 deletions

View file

@ -3,9 +3,14 @@
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
from collections import defaultdict
from random import uniform
from time import time, sleep
import requests import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry from requests.packages.urllib3.util.retry import Retry
from six.moves.urllib.parse import urlparse
from .configuration import UserAgent from .configuration import UserAgent
@ -21,6 +26,13 @@ ConnectionTimeoutSecs = 60
class Session(requests.Session): class Session(requests.Session):
"""This session implements a very simple host-based throttling system: For
each hostname we see, we keep a record of the earliest time at which we
want to send the next request. If that time has not been reached when a
request is about to be sent, we sleep until it has. By default, we only
delay by a random amount of at most 0.3sec - but some hosts might need
longer delays.
"""
def __init__(self): def __init__(self):
super(Session, self).__init__() super(Session, self).__init__()
@ -29,11 +41,35 @@ class Session(requests.Session):
self.mount('https://', HTTPAdapter(max_retries=retry)) self.mount('https://', HTTPAdapter(max_retries=retry))
self.headers.update({'User-Agent': UserAgent}) self.headers.update({'User-Agent': UserAgent})
self.throttles = defaultdict(lambda: RandomThrottle(0.0, 0.3))
def send(self, request, **kwargs): def send(self, request, **kwargs):
if 'timeout' not in kwargs: if 'timeout' not in kwargs:
kwargs['timeout'] = ConnectionTimeoutSecs kwargs['timeout'] = ConnectionTimeoutSecs
hostname = urlparse(request.url).hostname
self.throttles[hostname].delay()
return super(Session, self).send(request, **kwargs) return super(Session, self).send(request, **kwargs)
def add_throttle(self, hostname, th_min, th_max):
"""Adds a new throttle for a host: Might overwrite the existing one.
"""
self.throttles[hostname] = RandomThrottle(th_min, th_max)
class RandomThrottle(object):
    """Enforce a randomized minimum gap between consecutive requests.

    Every call to delay() schedules the earliest permitted time of the
    following call as "now" plus a number of seconds drawn uniformly from
    the range spanned by th_min and th_max.
    """

    def __init__(self, th_min, th_max):
        """Create a throttle whose first delay() normally does not wait.

        th_min, th_max: bounds, in seconds, of the random gap that is
        scheduled after each request.
        """
        self.th_min = th_min
        self.th_max = th_max
        # Earliest wall-clock timestamp at which the next request may go out.
        self.next = time()

    def delay(self):
        """Sleep until the next request is allowed, then schedule the
        earliest time for the request after that.
        """
        remaining = self.next - time()
        if remaining > 0:
            # The gap is never scheduled further ahead than the larger of
            # the two bounds. A longer remaining time can only mean that the
            # wall clock was set backwards in the meantime, so cap the sleep
            # instead of stalling for as long as the clock jumped.
            sleep(min(remaining, max(self.th_min, self.th_max)))
        self.next = time() + uniform(self.th_min, self.th_max)
# A default session for cookie and connection sharing # A default session for cookie and connection sharing
default_session = Session() default_session = Session()

View file

@ -5,8 +5,6 @@
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import time
import random
import os import os
import re import re
from six.moves.urllib.parse import urljoin from six.moves.urllib.parse import urljoin
@ -200,9 +198,6 @@ class Scraper(object):
out.warn(u"Already seen previous URL %r" % prevUrl) out.warn(u"Already seen previous URL %r" % prevUrl)
break break
url = prevUrl url = prevUrl
if url:
# wait up to 2 seconds for next URL
time.sleep(1.0 + random.random())
def getPrevUrl(self, url, data): def getPrevUrl(self, url, data):
"""Find previous URL.""" """Find previous URL."""