Move basic HTTP setup into a new module

We now subclass requests' Session to make further extensions of the HTTP
flow possible.
Tobias Gruetzmacher 2019-12-03 20:27:37 +01:00
parent fcebd63e66
commit e5e7dfacd6
5 changed files with 55 additions and 37 deletions

dosagelib/http.py Normal file

@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from .configuration import UserAgent
# Default number of retries
MaxRetries = 3
# Factor for retry backoff (see urllib3.util.retry, this default means
# 2s, 4s, 8s)
RetryBackoffFactor = 2
# Default connection timeout
ConnectionTimeoutSecs = 60
class Session(requests.Session):
    def __init__(self):
        super(Session, self).__init__()

        retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
        self.mount('http://', HTTPAdapter(max_retries=retry))
        self.mount('https://', HTTPAdapter(max_retries=retry))
        self.headers.update({'User-Agent': UserAgent})

    def send(self, request, **kwargs):
        if 'timeout' not in kwargs:
            kwargs['timeout'] = ConnectionTimeoutSecs
        return super(Session, self).send(request, **kwargs)


# A default session for cookie and connection sharing
default_session = Session()
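
To illustrate how the new module is meant to be consumed (a rough sketch, not part of this commit: fetch_index and VerboseSession are made-up names, while http.Session and http.default_session come from the file above):

from dosagelib import http

def fetch_index(url):
    # Reuses the shared session: the retry policy, the User-Agent header and
    # the 60-second default timeout from Session.send() apply automatically.
    return http.default_session.get(url).text

class VerboseSession(http.Session):
    """Hypothetical further extension of the HTTP flow."""
    def send(self, request, **kwargs):
        # Extra behaviour (logging, caching, custom headers, ...) can be
        # layered in here before delegating to the base implementation.
        print('fetching %s' % request.url)
        return super(VerboseSession, self).send(request, **kwargs)

With backoff_factor=2 the retries back off roughly as 2s, 4s and 8s, which is what the comment in the module above refers to.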

@@ -24,9 +24,9 @@ try:
except ImportError:
    pycountry = None
-from . import loader, configuration, languages
+from . import configuration, http, languages, loader
from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
-                   normaliseURL, prettyMatcherList, requests_session, uniq)
+                   normaliseURL, prettyMatcherList, uniq)
from .comic import ComicStrip
from .output import out
from .events import getHandler
@@ -85,7 +85,7 @@ class Scraper(object):
    allow_errors = ()
    # HTTP session for configuration & cookies
-    session = requests_session()
+    session = http.default_session
    @classmethod
    def getmodules(cls):
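
Because session is now a plain class attribute bound to the shared http.default_session, all scraper modules share one cookie jar and connection pool by default, while an individual module can still swap in its own session. A minimal sketch under that assumption (IsolatedComic is a made-up module name, and the usual scraper attributes are omitted):

from dosagelib import http
from dosagelib.scraper import Scraper

class IsolatedComic(Scraper):
    # Hypothetical module that must keep its cookies separate from the
    # shared default_session: it simply binds its own Session instance.
    session = http.Session()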

@@ -1,16 +1,18 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2017 Tobias Gruetzmacher
+# Copyright (C) 2015-2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
import os
+from distutils.version import StrictVersion
import dosagelib
from dosagelib import configuration
-from .util import urlopen
-from distutils.version import StrictVersion
-import requests
+from . import http
UPDATE_URL = "https://api.github.com/repos/webcomics/dosage/releases/latest"
@@ -38,13 +40,14 @@ def check_update():
def get_online_version():
    """Download update info and parse it."""
-    session = requests.session()
-    page = urlopen(UPDATE_URL, session).json()
+    page = http.default_session.get(UPDATE_URL).json()
    version, url = None, None
    version = page['tag_name']
    if os.name == 'nt':
-        url = next((x['browser_download_url'] for x in page['assets'] if x['content_type'] == 'application/x-msdos-program'), configuration.Url)
+        url = next((x['browser_download_url'] for x in page['assets'] if
+                    x['content_type'] == 'application/x-msdos-program'),
+                   configuration.Url)
    else:
        url = page['tarball_url']
    return version, url

@@ -9,8 +9,6 @@ from six.moves.urllib.parse import (
    quote as url_quote, unquote as url_unquote, urlparse, urlunparse, urlsplit)
from six.moves.urllib_robotparser import RobotFileParser
import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry
import sys
import os
import cgi
@@ -40,16 +38,6 @@ from . import AppName
# Maximum content size for HTML pages
MaxContentBytes = 1024 * 1024 * 3  # 3 MB
-# Default number of retries
-MaxRetries = 3
-# Factor for retry backoff (see urllib3.util.retry, this default means
-# 2s, 4s, 8s)
-RetryBackoffFactor = 2
-# Default connection timeout
-ConnectionTimeoutSecs = 60
# The character set to encode non-ASCII characters in a URL. See also
# http://tools.ietf.org/html/rfc2396#section-2.1
# Note that the encoding is not really specified, but most browsers
@@ -59,15 +47,6 @@ ConnectionTimeoutSecs = 60
UrlEncoding = "utf-8"
-def requests_session():
-    s = requests.Session()
-    retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
-    s.mount('http://', HTTPAdapter(max_retries=retry))
-    s.mount('https://', HTTPAdapter(max_retries=retry))
-    s.headers.update({'User-Agent': UserAgent})
-    return s
def get_system_uid():
    """Get a (probably) unique ID to identify a system.
    Used to differentiate votes.
@@ -285,8 +264,6 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
        kwargs['headers']['Referer'] = referrer
    out.debug(u'Sending headers %s' % kwargs['headers'], level=3)
    out.debug(u'Sending cookies %s' % session.cookies)
-    if 'timeout' not in kwargs:
-        kwargs['timeout'] = ConnectionTimeoutSecs
    if 'data' not in kwargs:
        method = 'GET'
    else:

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2017 Tobias Gruetzmacher
+# Copyright (C) 2015-2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@@ -17,13 +17,12 @@ try:
except ImportError:
    from os import rename
-import requests
from lxml import html
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
from dosagelib.util import unescape, get_page
-from dosagelib import scraper
+from dosagelib import scraper, http
def first_lower(x):
@@ -39,7 +38,7 @@ class ComicListUpdater(object):
    def __init__(self, name):
        self.json = name.replace(".py", ".json")
-        self.session = requests.Session()
+        self.session = http.default_session
        self.sleep = 0
    def get_url(self, url, expand=True):