Merge branch 'host-based-delay'
commit 1501055513
9 changed files with 118 additions and 60 deletions

dosagelib/http.py (new file, 75 lines)

@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2019 Tobias Gruetzmacher
+
+from __future__ import absolute_import, division, print_function
+
+from collections import defaultdict
+from random import uniform
+from time import time, sleep
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+from six.moves.urllib.parse import urlparse
+
+from .configuration import UserAgent
+
+# Default number of retries
+MaxRetries = 3
+
+# Factor for retry backoff (see urllib3.util.retry, this default means
+# 2s, 4s, 8s)
+RetryBackoffFactor = 2
+
+# Default connection timeout
+ConnectionTimeoutSecs = 60
+
+
+class Session(requests.Session):
+    """This session implements a very simple host-based throttling system: For
+    each hostname we see, we keep a record on when is the earliest time we want
+    to send the next request: If before sending a request this time isn't
+    reached, we sleep a bit until the requirements are satisfied. By default,
+    we only delay a random amount of at most 0.3sec - but some hosts might need
+    longer delays.
+    """
+    def __init__(self):
+        super(Session, self).__init__()
+
+        retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
+        self.mount('http://', HTTPAdapter(max_retries=retry))
+        self.mount('https://', HTTPAdapter(max_retries=retry))
+        self.headers.update({'User-Agent': UserAgent})
+
+        self.throttles = defaultdict(lambda: RandomThrottle(0.0, 0.3))
+
+    def send(self, request, **kwargs):
+        if 'timeout' not in kwargs:
+            kwargs['timeout'] = ConnectionTimeoutSecs
+
+        hostname = urlparse(request.url).hostname
+        self.throttles[hostname].delay()
+
+        return super(Session, self).send(request, **kwargs)
+
+    def add_throttle(self, hostname, th_min, th_max):
+        """Adds a new throttle for a host: Might overwrite the existing one.
+        """
+        self.throttles[hostname] = RandomThrottle(th_min, th_max)
+
+
+class RandomThrottle(object):
+    def __init__(self, th_min, th_max):
+        self.th_min = th_min
+        self.th_max = th_max
+        self.next = time()
+
+    def delay(self):
+        d = self.next - time()
+        if d > 0:
+            sleep(d)
+        self.next = time() + uniform(self.th_min, self.th_max)
+
+
+# A default session for cookie and connection sharing
+default_session = Session()
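
As a quick illustration of the API this new file introduces (a sketch for
orientation, not part of the commit; the host and URLs are placeholders):

    from dosagelib import http

    # Unknown hosts get the default RandomThrottle(0.0, 0.3); a stricter
    # per-host delay can be registered up front, as the GoComics change
    # below does for www.gocomics.com.
    http.default_session.add_throttle('example.com', 1.0, 2.0)

    # send() sleeps until the host's earliest-send time has passed, then
    # schedules the next one at time() + uniform(th_min, th_max).
    http.default_session.get('https://example.com/comics/1')
    http.default_session.get('https://example.com/comics/2')  # waits ~1-2s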

(file 2 of 9)

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2018 Tobias Gruetzmacher
+# Copyright (C) 2015-2019 Tobias Gruetzmacher

 from __future__ import absolute_import, division, print_function

@@ -10,7 +10,7 @@ from ..helpers import indirectStarter, xpath_class


 class GoComics(_ParserScraper):
-    url = 'http://www.gocomics.com/'
+    url = 'https://www.gocomics.com/'
     imageSearch = '//picture[{}]/img'.format(xpath_class('item-comic-image'))
     prevSearch = '//a[{}]'.format(xpath_class('js-previous-comic'))
     latestSearch = '//div[{}]//a'.format(xpath_class('gc-deck--cta-0'))
@@ -19,7 +19,8 @@ class GoComics(_ParserScraper):

     def __init__(self, name, path, lang=None):
         super(GoComics, self).__init__('GoComics/' + name)
-        self.url = 'http://www.gocomics.com/' + path
+        self.session.add_throttle('www.gocomics.com', 1.0, 2.0)
+        self.url = 'https://www.gocomics.com/' + path
         self.shortname = name
         if lang:
             self.lang = lang

(file 3 of 9)

@@ -5,8 +5,6 @@

 from __future__ import absolute_import, division, print_function

-import time
-import random
 import os
 import re
 from six.moves.urllib.parse import urljoin
@@ -24,9 +22,9 @@ try:
 except ImportError:
     pycountry = None

-from . import loader, configuration, languages
+from . import configuration, http, languages, loader
 from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
-                   normaliseURL, prettyMatcherList, requests_session, uniq)
+                   normaliseURL, prettyMatcherList, uniq)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -85,7 +83,7 @@ class Scraper(object):
     allow_errors = ()

     # HTTP session for configuration & cookies
-    session = requests_session()
+    session = http.default_session

     @classmethod
     def getmodules(cls):
@@ -200,9 +198,6 @@ class Scraper(object):
                 out.warn(u"Already seen previous URL %r" % prevUrl)
                 break
             url = prevUrl
-            if url:
-                # wait up to 2 seconds for next URL
-                time.sleep(1.0 + random.random())

     def getPrevUrl(self, url, data):
         """Find previous URL."""

(file 4 of 9)

@@ -1,16 +1,18 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2017 Tobias Gruetzmacher
+# Copyright (C) 2015-2019 Tobias Gruetzmacher

 from __future__ import absolute_import, division, print_function

 import os

+from distutils.version import StrictVersion
+
 import dosagelib
 from dosagelib import configuration
-from .util import urlopen
-from distutils.version import StrictVersion
-
-import requests
+from . import http

 UPDATE_URL = "https://api.github.com/repos/webcomics/dosage/releases/latest"
@@ -38,13 +40,14 @@ def check_update():

 def get_online_version():
     """Download update info and parse it."""
-    session = requests.session()
-    page = urlopen(UPDATE_URL, session).json()
+    page = http.default_session.get(UPDATE_URL).json()
     version, url = None, None
     version = page['tag_name']

     if os.name == 'nt':
-        url = next((x['browser_download_url'] for x in page['assets'] if x['content_type'] == 'application/x-msdos-program'), configuration.Url)
+        url = next((x['browser_download_url'] for x in page['assets'] if
+                    x['content_type'] == 'application/x-msdos-program'),
+                   configuration.Url)
     else:
         url = page['tarball_url']
     return version, url

(file 5 of 9)

@@ -9,8 +9,6 @@ from six.moves.urllib.parse import (
     quote as url_quote, unquote as url_unquote, urlparse, urlunparse, urlsplit)
 from six.moves.urllib_robotparser import RobotFileParser
 import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry
 import sys
 import os
 import cgi
@@ -40,16 +38,6 @@ from . import AppName
 # Maximum content size for HTML pages
 MaxContentBytes = 1024 * 1024 * 3  # 3 MB

-# Default number of retries
-MaxRetries = 3
-
-# Factor for retry backoff (see urllib3.util.retry, this default means
-# 2s, 4s, 8s)
-RetryBackoffFactor = 2
-
-# Default connection timeout
-ConnectionTimeoutSecs = 60
-
 # The character set to encode non-ASCII characters in a URL. See also
 # http://tools.ietf.org/html/rfc2396#section-2.1
 # Note that the encoding is not really specified, but most browsers
@@ -59,15 +47,6 @@ ConnectionTimeoutSecs = 60
 UrlEncoding = "utf-8"


-def requests_session():
-    s = requests.Session()
-    retry = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
-    s.mount('http://', HTTPAdapter(max_retries=retry))
-    s.mount('https://', HTTPAdapter(max_retries=retry))
-    s.headers.update({'User-Agent': UserAgent})
-    return s
-
-
 def get_system_uid():
     """Get a (probably) unique ID to identify a system.
     Used to differentiate votes.
@@ -285,8 +264,6 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
         kwargs['headers']['Referer'] = referrer
     out.debug(u'Sending headers %s' % kwargs['headers'], level=3)
     out.debug(u'Sending cookies %s' % session.cookies)
-    if 'timeout' not in kwargs:
-        kwargs['timeout'] = ConnectionTimeoutSecs
     if 'data' not in kwargs:
         method = 'GET'
     else:

(file 6 of 9)

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2017 Tobias Gruetzmacher
+# Copyright (C) 2015-2019 Tobias Gruetzmacher

 from __future__ import absolute_import, division, print_function

@@ -17,13 +17,12 @@ try:
 except ImportError:
     from os import rename

-import requests
 from lxml import html

 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa

 from dosagelib.util import unescape, get_page
-from dosagelib import scraper
+from dosagelib import scraper, http


 def first_lower(x):
@@ -39,7 +38,7 @@ class ComicListUpdater(object):

     def __init__(self, name):
         self.json = name.replace(".py", ".json")
-        self.session = requests.Session()
+        self.session = http.default_session
         self.sleep = 0

     def get_url(self, url, expand=True):

(file 7 of 9)

@@ -55,6 +55,7 @@ bash =
     argcomplete
 test =
     pytest-cov
+    pytest-xdist
     responses

 [bdist_wheel]

(file 8 of 9)

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2018 Tobias Gruetzmacher
+# Copyright (C) 2015-2019 Tobias Gruetzmacher

 from __future__ import absolute_import, division, print_function

@@ -10,15 +10,10 @@ import multiprocessing
 from six.moves.urllib.parse import urlsplit


-def get_host(url):
-    """Get host part of URL."""
-    return urlsplit(url)[1].lower()
-
-
 # Dictionary with per-host locks.
 _locks = {}
 # Allowed number of connections per host
-MaxConnections = 4
+MaxConnections = 2
 # Maximum number of strips to get to test a comic
 MaxStrips = 5

@@ -30,16 +25,12 @@ def get_lock(host):
     return _locks[host]


-def test_comicmodule(tmpdir, scraperobj):
+def test_comicmodule(tmpdir, scraperobj, worker_id):
     '''Test a scraper. It must be able to traverse backward for at least 5
     strips from the start, and find strip images on at least 4 pages.'''
     # Limit number of connections to one host.
-    host = get_host(scraperobj.url)
-    try:
-        with get_lock(host):
-            _test_comic(str(tmpdir), scraperobj)
-    except OSError:
-        # interprocess lock not supported
+    host = urlsplit(scraperobj.url).hostname
+    with get_lock(host):
         _test_comic(str(tmpdir), scraperobj)


(file 9 of 9)

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2016 Tobias Gruetzmacher
+# Copyright (C) 2015-2019 Tobias Gruetzmacher

 from __future__ import absolute_import, division, print_function

@@ -9,6 +9,9 @@ import re
 import operator
 import os

+import pytest
+from xdist.dsession import LoadScopeScheduling
+
 from dosagelib import scraper


@@ -45,3 +48,16 @@ def pytest_generate_tests(metafunc):
     if 'scraperobj' in metafunc.fixturenames:
         metafunc.parametrize('scraperobj', get_test_scrapers(),
                              ids=operator.attrgetter('name'))
+
+
+class LoadModScheduling(LoadScopeScheduling):
+    """Implement load scheduling for comic modules. See xdist for details."""
+
+    def _split_scope(self, nodeid):
+        mod, test = nodeid.split("::", 1)
+        return mod + "::" + test.split("/", 1)[0]
+
+
+@pytest.mark.trylast
+def pytest_xdist_make_scheduler(config, log):
+    return LoadModScheduling(config, log)
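
For context on the scheduler added above: LoadScopeScheduling assigns whole
scopes to xdist workers, and _split_scope shortens the scope to everything
before the first slash in the parametrized test id, so all modules of one
comic site should land on the same worker, keeping the per-host throttle and
connection limit effective. A worked example with a hypothetical node id:

    nodeid = "tests/test_comics.py::test_comicmodule[GoComics/Garfield]"
    mod, test = nodeid.split("::", 1)
    # mod  == "tests/test_comics.py"
    # test == "test_comicmodule[GoComics/Garfield]"
    scope = mod + "::" + test.split("/", 1)[0]
    # scope == "tests/test_comics.py::test_comicmodule[GoComics"
    # so every GoComics/* test shares one scope, hence one worker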