Add simple host-based throttling

This commit is contained in:
Tobias Gruetzmacher 2019-12-03 22:21:28 +01:00
parent e5e7dfacd6
commit a347bebfe3
2 changed files with 36 additions and 5 deletions

View file

@ -3,9 +3,14 @@
from __future__ import absolute_import, division, print_function
from collections import defaultdict
from random import uniform
from time import time, sleep
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from six.moves.urllib.parse import urlparse
from .configuration import UserAgent
@ -21,6 +26,13 @@ ConnectionTimeoutSecs = 60
class Session(requests.Session):
"""This session implements a very simple host-based throttling system: For
each hostname we see, we keep a record of the earliest time at which we want
to send the next request. If that time has not yet been reached when a
request is about to be sent, we sleep until it has. By default, we only
delay by a random amount of at most 0.3 seconds, but some hosts might need
longer delays.
"""
def __init__(self):
super(Session, self).__init__()
@ -29,11 +41,35 @@ class Session(requests.Session):
self.mount('https://', HTTPAdapter(max_retries=retry))
self.headers.update({'User-Agent': UserAgent})
self.throttles = defaultdict(lambda: RandomThrottle(0.0, 0.3))
def send(self, request, **kwargs):
    """Send a prepared request once the throttle for its host allows it.

    A ``timeout`` of ``ConnectionTimeoutSecs`` is supplied unless the
    caller passed one explicitly. All keyword arguments are forwarded to
    the parent session's ``send``, whose result is returned.
    """
    kwargs.setdefault('timeout', ConnectionTimeoutSecs)
    # Throttles are keyed by the hostname component of the request URL.
    throttle = self.throttles[urlparse(request.url).hostname]
    throttle.delay()
    return super(Session, self).send(request, **kwargs)
def add_throttle(self, hostname, th_min, th_max):
    """Install a throttle for ``hostname`` whose random delay lies between
    ``th_min`` and ``th_max`` seconds. A throttle already registered for
    that host is replaced.
    """
    throttle = RandomThrottle(th_min, th_max)
    self.throttles[hostname] = throttle
class RandomThrottle(object):
    """Enforce a randomised gap between consecutive calls to ``delay``.

    After every ``delay`` call, the earliest permitted time of the next call
    is set to "now" plus an interval drawn uniformly from the range spanned
    by ``th_min`` and ``th_max`` (both in seconds).
    """

    def __init__(self, th_min, th_max):
        """Create a throttle whose first ``delay`` call does not wait.

        :param th_min: one bound of the random gap, in seconds.
        :param th_max: the other bound of the random gap, in seconds.
        """
        self.th_min = th_min
        self.th_max = th_max
        # Earliest wall-clock timestamp at which the next call may proceed.
        self.next = time()

    def delay(self):
        """Sleep until the next call is permitted, then schedule the one
        after it.
        """
        d = self.next - time()
        # ``next`` is never scheduled further ahead than the larger bound, so
        # a longer remaining wait means the wall clock went backwards (or
        # ``next`` was tampered with). Cap the wait instead of stalling for
        # an arbitrary amount of time.
        d = min(d, max(self.th_min, self.th_max))
        if d > 0:
            sleep(d)
        self.next = time() + uniform(self.th_min, self.th_max)
# A default session for cookie and connection sharing: a single Session
# instance created at module level.
default_session = Session()

View file

@ -5,8 +5,6 @@
from __future__ import absolute_import, division, print_function
import time
import random
import os
import re
from six.moves.urllib.parse import urljoin
@ -200,9 +198,6 @@ class Scraper(object):
out.warn(u"Already seen previous URL %r" % prevUrl)
break
url = prevUrl
if url:
# wait up to 2 seconds for next URL
time.sleep(1.0 + random.random())
def getPrevUrl(self, url, data):
"""Find previous URL."""