Enhance default session with host-specific options

This makes much more sense than building custom sessions inside specific
modules.
This commit is contained in:
Tobias Gruetzmacher 2022-02-15 00:18:48 +01:00
parent 6041c1e03b
commit 45162bf7f2
3 changed files with 17 additions and 16 deletions

View file

View file

@ -31,7 +31,7 @@ class Session(requests.Session):
longer delays. longer delays.
""" """
def __init__(self): def __init__(self):
super(Session, self).__init__() super().__init__()
retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor) retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
self.mount('http://', HTTPAdapter(max_retries=retry)) self.mount('http://', HTTPAdapter(max_retries=retry))
@ -39,6 +39,7 @@ class Session(requests.Session):
self.headers.update({'User-Agent': UserAgent}) self.headers.update({'User-Agent': UserAgent})
self.throttles = collections.defaultdict(lambda: RandomThrottle()) self.throttles = collections.defaultdict(lambda: RandomThrottle())
self.host_options = {}
def send(self, request, **kwargs): def send(self, request, **kwargs):
if 'timeout' not in kwargs: if 'timeout' not in kwargs:
@ -46,6 +47,8 @@ class Session(requests.Session):
hostname = urlparse(request.url).hostname hostname = urlparse(request.url).hostname
self.throttles[hostname].delay() self.throttles[hostname].delay()
if hostname in self.host_options:
kwargs.update(self.host_options[hostname])
return super(Session, self).send(request, **kwargs) return super(Session, self).send(request, **kwargs)
@ -54,6 +57,11 @@ class Session(requests.Session):
""" """
self.throttles[hostname] = RandomThrottle(th_min, th_max) self.throttles[hostname] = RandomThrottle(th_min, th_max)
def add_host_options(self, hostname, options):
"""Add custom options for a specific host. Any options previously set
for that host are replaced.
"""
self.host_options[hostname] = options
class RandomThrottle(object): class RandomThrottle(object):
def __init__(self, th_min=0.0, th_max=0.3): def __init__(self, th_min=0.0, th_max=0.3):

View file

@ -3,15 +3,10 @@
# Copyright (C) 2019 Thomas W. Littauer # Copyright (C) 2019 Thomas W. Littauer
import re import re
try:
from functools import cached_property
except ImportError:
from cached_property import cached_property
from importlib.resources import path as get_path from importlib.resources import path as get_path
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..helpers import bounceStarter, joinPathPartsNamer from ..helpers import bounceStarter, joinPathPartsNamer
from ..http import Session
class ComicsKingdom(_BasicScraper): class ComicsKingdom(_BasicScraper):
@ -22,21 +17,19 @@ class ComicsKingdom(_BasicScraper):
namer = joinPathPartsNamer((-2, -1), ()) namer = joinPathPartsNamer((-2, -1), ())
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
@cached_property
def session(self):
'''Use our own isolated session (ComicsKingdom screws up their TLS setup
from time to time, this should "fix" it)'''
s = Session()
# slightly iffy hack taken from certifi
self.cert_ctx = get_path("dosagelib.data", "godaddy-bundle-g2-2031.pem")
s.verify = str(self.cert_ctx.__enter__())
return s
def __init__(self, name, path): def __init__(self, name, path):
super(ComicsKingdom, self).__init__('ComicsKingdom/' + name) super(ComicsKingdom, self).__init__('ComicsKingdom/' + name)
self.url = 'https://comicskingdom.com/' + path self.url = 'https://comicskingdom.com/' + path
self.stripUrl = self.url + '/%s' self.stripUrl = self.url + '/%s'
# slightly iffy hack taken from certifi
# We need our own certificate bundle since ComicsKingdom screws up their
# TLS setup from time to time; this should "fix" it.
self.cert_ctx = get_path('dosagelib.data', 'godaddy-bundle-g2-2031.pem')
self.session.add_host_options('comicskingdom.com', {
'verify': str(self.cert_ctx.__enter__()),
})
def link_modifier(self, url, tourl): def link_modifier(self, url, tourl):
if self.url not in tourl: if self.url not in tourl:
tourl = self.url + '/' + tourl.rsplit("/", 1)[1] tourl = self.url + '/' + tourl.rsplit("/", 1)[1]