Try to inform the user about geo-blocks
Instead of letting the crawler run into "random" error messages, throw a specific "geoblocked" exception.
This commit is contained in:
parent
e34a0b539c
commit
7e040086b6
4 changed files with 43 additions and 1 deletions
|
@ -6,6 +6,8 @@
|
|||
from re import compile, escape, IGNORECASE, sub
|
||||
from os.path import splitext
|
||||
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer
|
||||
from ..util import tagre
|
||||
|
@ -385,6 +387,16 @@ class SoloLeveling(_ParserScraper):
|
|||
self.imageUrls = [self.imageUrlModifier(x, data) for x in self.imageUrls]
|
||||
return self.imageUrls
|
||||
|
||||
def getPage(self, url):
    """Fetch ``url`` via the parent class, reporting WAF blocks as geo-blocks.

    The parent implementation's result is returned unchanged. If the parent
    raises an ``HTTPError`` whose response has status 403 and a body
    containing ``1020``, ``self.geoblocked()`` is called; any other
    ``HTTPError`` is re-raised as-is.
    """
    try:
        return super().getPage(url)
    except HTTPError as e:
        response = e.response
        # CloudFlare WAF
        # The response may be absent on a hand-constructed HTTPError, so
        # guard before touching its attributes.
        if (response is not None and response.status_code == 403
                and '1020' in response.text):
            self.geoblocked()
        else:
            # Bare raise re-raises the active exception without adding
            # this frame to its traceback a second time.
            raise
|
||||
|
||||
def getPrevUrl(self, url, data):
    """Build the previous URL by decrementing the trailing number of ``url``.

    Surrounding slashes are ignored; the text after the last ``-`` is parsed
    as an integer, reduced by one and substituted into ``self.stripUrl``.
    """
    trailing = url.strip('/').rsplit('-', 1)[-1]
    previous = int(trailing) - 1
    return self.stripUrl % str(previous)
|
||||
|
||||
|
|
|
@ -29,6 +29,9 @@ class SmackJeeves(_ParserScraper):
|
|||
response = self.session.post(self.apiBase + 'articleList',
|
||||
params={'titleNo': self._comicid})
|
||||
response.raise_for_status()
|
||||
if ('text/html' in response.headers['content-type'] and
|
||||
'available in your area' in response.text):
|
||||
self.geoblocked()
|
||||
return response.json()['result']['list'][self.lastid]['articleUrl']
|
||||
|
||||
def fetchUrls(self, url, data, urlsearch):
|
||||
|
|
|
@ -32,6 +32,11 @@ from .xml import NS
|
|||
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
|
||||
|
||||
|
||||
class GeoblockedException(IOError):
    """Error signalling that the user's location appears to be geo-blocked."""

    def __init__(self):
        # The message has no placeholders, so a plain string literal is used
        # rather than an f-string.
        super().__init__('It seems your current location is geo-blocked.')
|
||||
|
||||
|
||||
class Scraper(object):
|
||||
'''Base class for all comic scraper, but without a specific scrape
|
||||
implementation.'''
|
||||
|
@ -346,6 +351,10 @@ class Scraper(object):
|
|||
pass
|
||||
return lang
|
||||
|
||||
def geoblocked(self):
    """Raise GeoblockedException to signal that the user is probably geo-blocked."""
    error = GeoblockedException()
    raise error
|
||||
|
||||
|
||||
class _BasicScraper(Scraper):
|
||||
"""
|
||||
|
|
|
@ -7,7 +7,9 @@ import responses
|
|||
|
||||
import dosagelib.cmd
|
||||
import httpmocks
|
||||
|
||||
from dosagelib.plugins.s import SoloLeveling
|
||||
from dosagelib.plugins.smackjeeves import SmackJeeves
|
||||
from dosagelib.scraper import GeoblockedException
|
||||
|
||||
def cmd(*options):
|
||||
"""'Fake' run dosage with given options."""
|
||||
|
@ -39,3 +41,19 @@ class TestModules(object):
|
|||
|
||||
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol')
|
||||
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol:2012/07/22')
|
||||
|
||||
@responses.activate
def test_smackjeeves_geoblock(self, tmpdir):
    """An HTML 'not available in your area' API reply raises GeoblockedException."""
    # Dots are escaped so the pattern matches only the literal host name.
    api_url = re.compile(r'https://www\.smackjeeves\.com/api/.*')
    responses.add(responses.POST, api_url,
        'is not currently available in your area', content_type='text/html')

    with pytest.raises(GeoblockedException):
        next(SmackJeeves.getmodules()[0].getStrips(1))
|
||||
|
||||
@responses.activate
def test_sololeveling_geoblock(self, tmpdir):
    """A 403 reply whose body contains '1020' raises GeoblockedException."""
    responses.add(
        responses.GET,
        'https://w1.sololeveling.net/',
        body='<span>1020</span>',
        status=403,
    )

    with pytest.raises(GeoblockedException):
        scraper = SoloLeveling.getmodules()[0]
        strips = scraper.getStrips(1)
        next(strips)
|
||||
|
|
Loading…
Reference in a new issue