Try to inform the user about geo-blocks

Rather than letting the crawler run into "random" error messages, throw a
specific "geoblocked" exception.
This commit is contained in:
Tobias Gruetzmacher 2020-09-28 13:11:34 +02:00
parent e34a0b539c
commit 7e040086b6
4 changed files with 43 additions and 1 deletions

View file

@ -6,6 +6,8 @@
from re import compile, escape, IGNORECASE, sub from re import compile, escape, IGNORECASE, sub
from os.path import splitext from os.path import splitext
from requests.exceptions import HTTPError
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer
from ..util import tagre from ..util import tagre
@ -385,6 +387,16 @@ class SoloLeveling(_ParserScraper):
self.imageUrls = [self.imageUrlModifier(x, data) for x in self.imageUrls] self.imageUrls = [self.imageUrlModifier(x, data) for x in self.imageUrls]
return self.imageUrls return self.imageUrls
def getPage(self, url):
    """Fetch a page, reporting CloudFlare WAF blocks as geo-blocks.

    Delegates to the parent implementation. An HTTP 403 response whose
    body contains '1020' is reported through self.geoblocked(); every
    other HTTPError is re-raised unchanged.
    """
    try:
        return super().getPage(url)
    except HTTPError as e:
        response = e.response
        # CloudFlare WAF: a blocked client gets a 403 page mentioning 1020.
        # An HTTPError may carry no response at all, so guard before
        # reading its attributes.
        if (response is not None and response.status_code == 403
                and '1020' in response.text):
            self.geoblocked()
        else:
            # Bare raise re-raises the active exception as-is.
            raise
def getPrevUrl(self, url, data): def getPrevUrl(self, url, data):
return self.stripUrl % str(int(url.strip('/').rsplit('-', 1)[-1]) - 1) return self.stripUrl % str(int(url.strip('/').rsplit('-', 1)[-1]) - 1)

View file

@ -29,6 +29,9 @@ class SmackJeeves(_ParserScraper):
response = self.session.post(self.apiBase + 'articleList', response = self.session.post(self.apiBase + 'articleList',
params={'titleNo': self._comicid}) params={'titleNo': self._comicid})
response.raise_for_status() response.raise_for_status()
if ('text/html' in response.headers['content-type'] and
'available in your area' in response.text):
self.geoblocked()
return response.json()['result']['list'][self.lastid]['articleUrl'] return response.json()['result']['list'][self.lastid]['articleUrl']
def fetchUrls(self, url, data, urlsearch): def fetchUrls(self, url, data, urlsearch):

View file

@ -32,6 +32,11 @@ from .xml import NS
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/') ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
class GeoblockedException(IOError):
    """Error raised when the user's location seems to be geo-blocked."""

    def __init__(self):
        # Plain string literal: the message has no placeholders, so an
        # f-string prefix would be misleading.
        super().__init__('It seems your current location is geo-blocked.')
class Scraper(object): class Scraper(object):
'''Base class for all comic scraper, but without a specific scrape '''Base class for all comic scraper, but without a specific scrape
implementation.''' implementation.'''
@ -346,6 +351,10 @@ class Scraper(object):
pass pass
return lang return lang
def geoblocked(self):
    """Signal that the user is most probably geo-blocked.

    Always raises GeoblockedException; it never returns normally.
    """
    error = GeoblockedException()
    raise error
class _BasicScraper(Scraper): class _BasicScraper(Scraper):
""" """

View file

@ -7,7 +7,9 @@ import responses
import dosagelib.cmd import dosagelib.cmd
import httpmocks import httpmocks
from dosagelib.plugins.s import SoloLeveling
from dosagelib.plugins.smackjeeves import SmackJeeves
from dosagelib.scraper import GeoblockedException
def cmd(*options): def cmd(*options):
"""'Fake' run dosage with given options.""" """'Fake' run dosage with given options."""
@ -39,3 +41,19 @@ class TestModules(object):
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol') cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol')
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol:2012/07/22') cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol:2012/07/22')
@responses.activate
def test_smackjeeves_geoblock(self, tmpdir):
    """A geo-block HTML answer from the SmackJeeves API must raise
    GeoblockedException."""
    api_pattern = re.compile('https://www.smackjeeves.com/api/.*')
    # Mocked API answer: an HTML page instead of the expected JSON.
    responses.add(
        responses.POST,
        api_pattern,
        body='is not currently available in your area',
        content_type='text/html',
    )
    with pytest.raises(GeoblockedException):
        scraper = SmackJeeves.getmodules()[0]
        next(scraper.getStrips(1))
@responses.activate
def test_sololeveling_geoblock(self, tmpdir):
    """A 403 answer containing '1020' from the SoloLeveling site must
    raise GeoblockedException."""
    # Mocked site answer: HTTP 403 with a body mentioning 1020.
    responses.add(
        responses.GET,
        'https://w1.sololeveling.net/',
        body='<span>1020</span>',
        status=403,
    )
    with pytest.raises(GeoblockedException):
        scraper = SoloLeveling.getmodules()[0]
        next(scraper.getStrips(1))