Merge branch 'host-based-delay'

Tobias Gruetzmacher 2019-12-04 00:29:11 +01:00
commit 1501055513
9 changed files with 118 additions and 60 deletions

dosagelib/http.py (new file, 75 lines added)

@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from collections import defaultdict
from random import uniform
from time import time, sleep
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from six.moves.urllib.parse import urlparse
from .configuration import UserAgent
# Default number of retries
MaxRetries = 3
# Factor for retry backoff (see urllib3.util.retry, this default means
# 2s, 4s, 8s)
RetryBackoffFactor = 2
# Default connection timeout
ConnectionTimeoutSecs = 60
class Session(requests.Session):
"""This session implements a very simple host-based throttling system: For
each hostname we see, we keep a record on when is the earliest time we want
to send the next request: If before sending a request this time isn't
reached, we sleep a bit until the requirements are satisfied. By default,
we only delay a random amount of at most 0.3sec - but some hosts might need
longer delays.
"""
def __init__(self):
super(Session, self).__init__()
retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
self.mount('http://', HTTPAdapter(max_retries=retry))
self.mount('https://', HTTPAdapter(max_retries=retry))
self.headers.update({'User-Agent': UserAgent})
self.throttles = defaultdict(lambda: RandomThrottle(0.0, 0.3))
def send(self, request, **kwargs):
if 'timeout' not in kwargs:
kwargs['timeout'] = ConnectionTimeoutSecs
hostname = urlparse(request.url).hostname
self.throttles[hostname].delay()
return super(Session, self).send(request, **kwargs)
def add_throttle(self, hostname, th_min, th_max):
"""Adds a new throttle for a host: Might overwrite the existing one.
"""
self.throttles[hostname] = RandomThrottle(th_min, th_max)
class RandomThrottle(object):
def __init__(self, th_min, th_max):
self.th_min = th_min
self.th_max = th_max
self.next = time()
def delay(self):
d = self.next - time()
if d > 0:
sleep(d)
self.next = time() + uniform(self.th_min, self.th_max)
# A default session for cookie and connection sharing
default_session = Session()
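
For context only (not part of this commit's diff): a minimal sketch of how the new session could be used. The hostname, delay bounds and URL below are made up for illustration.

from dosagelib import http

session = http.default_session
# Register a longer per-request delay for a host that needs gentler crawling;
# hosts without an explicit throttle fall back to the random 0.0-0.3 s default.
session.add_throttle('comics.example.com', 1.0, 2.0)
# send() applies the per-host delay plus the default timeout and retry policy.
resp = session.get('https://comics.example.com/strip/1')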


@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2018 Tobias Gruetzmacher
# Copyright (C) 2015-2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@@ -10,7 +10,7 @@ from ..helpers import indirectStarter, xpath_class
class GoComics(_ParserScraper):
url = 'http://www.gocomics.com/'
url = 'https://www.gocomics.com/'
imageSearch = '//picture[{}]/img'.format(xpath_class('item-comic-image'))
prevSearch = '//a[{}]'.format(xpath_class('js-previous-comic'))
latestSearch = '//div[{}]//a'.format(xpath_class('gc-deck--cta-0'))
@@ -19,7 +19,8 @@ class GoComics(_ParserScraper):
def __init__(self, name, path, lang=None):
super(GoComics, self).__init__('GoComics/' + name)
self.url = 'http://www.gocomics.com/' + path
self.session.add_throttle('www.gocomics.com', 1.0, 2.0)
self.url = 'https://www.gocomics.com/' + path
self.shortname = name
if lang:
self.lang = lang


@@ -5,8 +5,6 @@
from __future__ import absolute_import, division, print_function
import time
import random
import os
import re
from six.moves.urllib.parse import urljoin
@@ -24,9 +22,9 @@ try:
except ImportError:
pycountry = None
from . import loader, configuration, languages
from . import configuration, http, languages, loader
from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
normaliseURL, prettyMatcherList, requests_session, uniq)
normaliseURL, prettyMatcherList, uniq)
from .comic import ComicStrip
from .output import out
from .events import getHandler
@@ -85,7 +83,7 @@ class Scraper(object):
allow_errors = ()
# HTTP session for configuration & cookies
session = requests_session()
session = http.default_session
@classmethod
def getmodules(cls):
@@ -200,9 +198,6 @@ class Scraper(object):
out.warn(u"Already seen previous URL %r" % prevUrl)
break
url = prevUrl
if url:
# wait up to 2 seconds for next URL
time.sleep(1.0 + random.random())
def getPrevUrl(self, url, data):
"""Find previous URL."""


@@ -1,16 +1,18 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2017 Tobias Gruetzmacher
# Copyright (C) 2015-2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
import os
from distutils.version import StrictVersion
import dosagelib
from dosagelib import configuration
from .util import urlopen
from distutils.version import StrictVersion
import requests
from . import http
UPDATE_URL = "https://api.github.com/repos/webcomics/dosage/releases/latest"
@@ -38,13 +40,14 @@ def check_update():
def get_online_version():
"""Download update info and parse it."""
session = requests.session()
page = urlopen(UPDATE_URL, session).json()
page = http.default_session.get(UPDATE_URL).json()
version, url = None, None
version = page['tag_name']
if os.name == 'nt':
url = next((x['browser_download_url'] for x in page['assets'] if x['content_type'] == 'application/x-msdos-program'), configuration.Url)
url = next((x['browser_download_url'] for x in page['assets'] if
x['content_type'] == 'application/x-msdos-program'),
configuration.Url)
else:
url = page['tarball_url']
return version, url


@@ -9,8 +9,6 @@ from six.moves.urllib.parse import (
quote as url_quote, unquote as url_unquote, urlparse, urlunparse, urlsplit)
from six.moves.urllib_robotparser import RobotFileParser
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import sys
import os
import cgi
@@ -40,16 +38,6 @@ from . import AppName
# Maximum content size for HTML pages
MaxContentBytes = 1024 * 1024 * 3 # 3 MB
# Default number of retries
MaxRetries = 3
# Factor for retry backoff (see urllib3.util.retry, this default means
# 2s, 4s, 8s)
RetryBackoffFactor = 2
# Default connection timeout
ConnectionTimeoutSecs = 60
# The character set to encode non-ASCII characters in a URL. See also
# http://tools.ietf.org/html/rfc2396#section-2.1
# Note that the encoding is not really specified, but most browsers
@@ -59,15 +47,6 @@ ConnectionTimeoutSecs = 60
UrlEncoding = "utf-8"
def requests_session():
s = requests.Session()
retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
s.mount('http://', HTTPAdapter(max_retries=retry))
s.mount('https://', HTTPAdapter(max_retries=retry))
s.headers.update({'User-Agent': UserAgent})
return s
def get_system_uid():
"""Get a (probably) unique ID to identify a system.
Used to differentiate votes.
@@ -285,8 +264,6 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
kwargs['headers']['Referer'] = referrer
out.debug(u'Sending headers %s' % kwargs['headers'], level=3)
out.debug(u'Sending cookies %s' % session.cookies)
if 'timeout' not in kwargs:
kwargs['timeout'] = ConnectionTimeoutSecs
if 'data' not in kwargs:
method = 'GET'
else:


@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2017 Tobias Gruetzmacher
# Copyright (C) 2015-2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@@ -17,13 +17,12 @@ try:
except ImportError:
from os import rename
import requests
from lxml import html
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import unescape, get_page
from dosagelib import scraper
from dosagelib import scraper, http
def first_lower(x):
@@ -39,7 +38,7 @@ class ComicListUpdater(object):
def __init__(self, name):
self.json = name.replace(".py", ".json")
self.session = requests.Session()
self.session = http.default_session
self.sleep = 0
def get_url(self, url, expand=True):


@@ -55,6 +55,7 @@ bash =
argcomplete
test =
pytest-cov
pytest-xdist
responses
[bdist_wheel]


@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2018 Tobias Gruetzmacher
# Copyright (C) 2015-2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@@ -10,15 +10,10 @@ import multiprocessing
from six.moves.urllib.parse import urlsplit
def get_host(url):
"""Get host part of URL."""
return urlsplit(url)[1].lower()
# Dictionary with per-host locks.
_locks = {}
# Allowed number of connections per host
MaxConnections = 4
MaxConnections = 2
# Maximum number of strips to get to test a comic
MaxStrips = 5
@@ -30,17 +25,13 @@ def get_lock(host):
return _locks[host]
def test_comicmodule(tmpdir, scraperobj):
def test_comicmodule(tmpdir, scraperobj, worker_id):
'''Test a scraper. It must be able to traverse backward for at least 5
strips from the start, and find strip images on at least 4 pages.'''
# Limit number of connections to one host.
host = get_host(scraperobj.url)
try:
host = urlsplit(scraperobj.url).hostname
with get_lock(host):
_test_comic(str(tmpdir), scraperobj)
except OSError:
# interprocess lock not supported
_test_comic(str(tmpdir), scraperobj)
def _test_comic(outdir, scraperobj):


@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
# Copyright (C) 2015-2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@@ -9,6 +9,9 @@ import re
import operator
import os
import pytest
from xdist.dsession import LoadScopeScheduling
from dosagelib import scraper
@@ -45,3 +48,16 @@ def pytest_generate_tests(metafunc):
if 'scraperobj' in metafunc.fixturenames:
metafunc.parametrize('scraperobj', get_test_scrapers(),
ids=operator.attrgetter('name'))
class LoadModScheduling(LoadScopeScheduling):
"""Implement load scheduling for comic modules. See xdist for details."""
def _split_scope(self, nodeid):
mod, test = nodeid.split("::", 1)
return mod + "::" + test.split("/", 1)[0]
@pytest.mark.trylast
def pytest_xdist_make_scheduler(config, log):
return LoadModScheduling(config, log)
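
For illustration only (not part of this commit): the scheduler above keys the xdist scope on everything before the first "/" in the test ID, so all strips of one comic site are scheduled onto the same worker and the per-host throttle and lock stay effective. A rough sketch of what _split_scope computes, assuming a hypothetical node ID (the file path and comic name are invented):

# Hypothetical node ID of one parametrized comic test.
nodeid = 'tests/modules/check_comics.py::test_comicmodule[GoComics/CalvinAndHobbes]'
mod, test = nodeid.split('::', 1)
scope = mod + '::' + test.split('/', 1)[0]
# scope == 'tests/modules/check_comics.py::test_comicmodule[GoComics'
# Every GoComics test shares this scope, e.g. when run with "pytest -n auto".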