Enhance default session with host-specific options
This makes much more sense than building custom sessions inside specific modules.
This commit is contained in:
parent
6041c1e03b
commit
45162bf7f2
3 changed files with 17 additions and 16 deletions
0
dosagelib/data/__init__.py
Normal file
0
dosagelib/data/__init__.py
Normal file
|
@ -31,7 +31,7 @@ class Session(requests.Session):
|
||||||
longer delays.
|
longer delays.
|
||||||
"""
|
"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super(Session, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
|
retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
|
||||||
self.mount('http://', HTTPAdapter(max_retries=retry))
|
self.mount('http://', HTTPAdapter(max_retries=retry))
|
||||||
|
@ -39,6 +39,7 @@ class Session(requests.Session):
|
||||||
self.headers.update({'User-Agent': UserAgent})
|
self.headers.update({'User-Agent': UserAgent})
|
||||||
|
|
||||||
self.throttles = collections.defaultdict(lambda: RandomThrottle())
|
self.throttles = collections.defaultdict(lambda: RandomThrottle())
|
||||||
|
self.host_options = {}
|
||||||
|
|
||||||
def send(self, request, **kwargs):
|
def send(self, request, **kwargs):
|
||||||
if 'timeout' not in kwargs:
|
if 'timeout' not in kwargs:
|
||||||
|
@ -46,6 +47,8 @@ class Session(requests.Session):
|
||||||
|
|
||||||
hostname = urlparse(request.url).hostname
|
hostname = urlparse(request.url).hostname
|
||||||
self.throttles[hostname].delay()
|
self.throttles[hostname].delay()
|
||||||
|
if hostname in self.host_options:
|
||||||
|
kwargs.update(self.host_options[hostname])
|
||||||
|
|
||||||
return super(Session, self).send(request, **kwargs)
|
return super(Session, self).send(request, **kwargs)
|
||||||
|
|
||||||
|
@ -54,6 +57,11 @@ class Session(requests.Session):
|
||||||
"""
|
"""
|
||||||
self.throttles[hostname] = RandomThrottle(th_min, th_max)
|
self.throttles[hostname] = RandomThrottle(th_min, th_max)
|
||||||
|
|
||||||
|
def add_host_options(self, hostname, options):
|
||||||
|
"""Adds custom options for a specific host: Might overwrite the existing one.
|
||||||
|
"""
|
||||||
|
self.host_options[hostname] = options
|
||||||
|
|
||||||
|
|
||||||
class RandomThrottle(object):
|
class RandomThrottle(object):
|
||||||
def __init__(self, th_min=0.0, th_max=0.3):
|
def __init__(self, th_min=0.0, th_max=0.3):
|
||||||
|
|
|
@ -3,15 +3,10 @@
|
||||||
# Copyright (C) 2019 Thomas W. Littauer
|
# Copyright (C) 2019 Thomas W. Littauer
|
||||||
import re
|
import re
|
||||||
|
|
||||||
try:
|
|
||||||
from functools import cached_property
|
|
||||||
except ImportError:
|
|
||||||
from cached_property import cached_property
|
|
||||||
from importlib.resources import path as get_path
|
from importlib.resources import path as get_path
|
||||||
|
|
||||||
from ..scraper import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
from ..helpers import bounceStarter, joinPathPartsNamer
|
from ..helpers import bounceStarter, joinPathPartsNamer
|
||||||
from ..http import Session
|
|
||||||
|
|
||||||
|
|
||||||
class ComicsKingdom(_BasicScraper):
|
class ComicsKingdom(_BasicScraper):
|
||||||
|
@ -22,21 +17,19 @@ class ComicsKingdom(_BasicScraper):
|
||||||
namer = joinPathPartsNamer((-2, -1), ())
|
namer = joinPathPartsNamer((-2, -1), ())
|
||||||
help = 'Index format: yyyy-mm-dd'
|
help = 'Index format: yyyy-mm-dd'
|
||||||
|
|
||||||
@cached_property
|
|
||||||
def session(self):
|
|
||||||
'''Use our own isolated session (ComicsKingdom screws up their TLS setup
|
|
||||||
from time to time, this should "fix" it)'''
|
|
||||||
s = Session()
|
|
||||||
# slightly iffy hack taken from certifi
|
|
||||||
self.cert_ctx = get_path("dosagelib.data", "godaddy-bundle-g2-2031.pem")
|
|
||||||
s.verify = str(self.cert_ctx.__enter__())
|
|
||||||
return s
|
|
||||||
|
|
||||||
def __init__(self, name, path):
|
def __init__(self, name, path):
|
||||||
super(ComicsKingdom, self).__init__('ComicsKingdom/' + name)
|
super(ComicsKingdom, self).__init__('ComicsKingdom/' + name)
|
||||||
self.url = 'https://comicskingdom.com/' + path
|
self.url = 'https://comicskingdom.com/' + path
|
||||||
self.stripUrl = self.url + '/%s'
|
self.stripUrl = self.url + '/%s'
|
||||||
|
|
||||||
|
# slightly iffy hack taken from certifi
|
||||||
|
# We need our own certificate bundle since ComicsKingdom screws up their
|
||||||
|
# TLS setup from time to time; this should "fix" it.
|
||||||
|
self.cert_ctx = get_path('dosagelib.data', 'godaddy-bundle-g2-2031.pem')
|
||||||
|
self.session.add_host_options('comicskingdom.com', {
|
||||||
|
'verify': str(self.cert_ctx.__enter__()),
|
||||||
|
})
|
||||||
|
|
||||||
def link_modifier(self, url, tourl):
|
def link_modifier(self, url, tourl):
|
||||||
if self.url not in tourl:
|
if self.url not in tourl:
|
||||||
tourl = self.url + '/' + tourl.rsplit("/", 1)[1]
|
tourl = self.url + '/' + tourl.rsplit("/", 1)[1]
|
||||||
|
|
Loading…
Reference in a new issue