# -*- coding: utf-8 -*- # Copyright (C) 2019-2020 Tobias Gruetzmacher import collections import random import time import requests from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry from urllib.parse import urlparse from .configuration import UserAgent # Default number of retries MaxRetries = 3 # Factor for retry backoff (see urllib3.util.retry, this default means # 2s, 4s, 8s) RetryBackoffFactor = 2 # Default connection timeout ConnectionTimeoutSecs = 60 class Session(requests.Session): """This session implements a very simple host-based throttling system: For each hostname we see, we keep a record on when is the earliest time we want to send the next request: If before sending a request this time isn't reached, we sleep a bit until the requirements are satisfied. By default, we only delay a random amount of at most 0.3sec - but some hosts might need longer delays. """ def __init__(self): super(Session, self).__init__() retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor) self.mount('http://', HTTPAdapter(max_retries=retry)) self.mount('https://', HTTPAdapter(max_retries=retry)) self.headers.update({'User-Agent': UserAgent}) self.throttles = collections.defaultdict(lambda: RandomThrottle()) def send(self, request, **kwargs): if 'timeout' not in kwargs: kwargs['timeout'] = ConnectionTimeoutSecs hostname = urlparse(request.url).hostname self.throttles[hostname].delay() return super(Session, self).send(request, **kwargs) def add_throttle(self, hostname, th_min, th_max): """Adds a new throttle for a host: Might overwrite the existing one. """ self.throttles[hostname] = RandomThrottle(th_min, th_max) class RandomThrottle(object): def __init__(self, th_min=0.0, th_max=0.3): self.th_min = th_min self.th_max = th_max self.next = time.time() def delay(self): d = self.next - time.time() if d > 0: time.sleep(d) self.next = time.time() + random.uniform(self.th_min, self.th_max) # A default session for cookie and connection sharing default_session = Session()