Try to inform the user about geo-blocks
Rather than letting the crawler run into "random" error messages, throw a specific "geoblocked" exception.
This commit is contained in:
parent
e34a0b539c
commit
7e040086b6
4 changed files with 43 additions and 1 deletions
|
@ -6,6 +6,8 @@
|
||||||
from re import compile, escape, IGNORECASE, sub
|
from re import compile, escape, IGNORECASE, sub
|
||||||
from os.path import splitext
|
from os.path import splitext
|
||||||
|
|
||||||
|
from requests.exceptions import HTTPError
|
||||||
|
|
||||||
from ..scraper import _BasicScraper, _ParserScraper
|
from ..scraper import _BasicScraper, _ParserScraper
|
||||||
from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer
|
from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
|
@ -385,6 +387,16 @@ class SoloLeveling(_ParserScraper):
|
||||||
self.imageUrls = [self.imageUrlModifier(x, data) for x in self.imageUrls]
|
self.imageUrls = [self.imageUrlModifier(x, data) for x in self.imageUrls]
|
||||||
return self.imageUrls
|
return self.imageUrls
|
||||||
|
|
||||||
|
def getPage(self, url):
    """Fetch a page, reporting a CloudFlare WAF block as a geo-block.

    Delegates to the parent implementation. If that raises an HTTPError
    whose response has status 403 and a body containing '1020', the
    geoblocked() helper is invoked; any other HTTPError is re-raised
    unchanged.
    """
    try:
        return super().getPage(url)
    except HTTPError as e:
        response = e.response
        # CloudFlare WAF: a 403 whose body mentions code 1020.
        # An HTTPError may carry no response at all, so guard against
        # None instead of masking the real error with an AttributeError.
        if (response is not None and response.status_code == 403
                and '1020' in response.text):
            self.geoblocked()
        else:
            # Bare raise propagates the original exception untouched.
            raise
|
||||||
|
|
||||||
def getPrevUrl(self, url, data):
|
def getPrevUrl(self, url, data):
|
||||||
return self.stripUrl % str(int(url.strip('/').rsplit('-', 1)[-1]) - 1)
|
return self.stripUrl % str(int(url.strip('/').rsplit('-', 1)[-1]) - 1)
|
||||||
|
|
||||||
|
|
|
@ -29,6 +29,9 @@ class SmackJeeves(_ParserScraper):
|
||||||
response = self.session.post(self.apiBase + 'articleList',
|
response = self.session.post(self.apiBase + 'articleList',
|
||||||
params={'titleNo': self._comicid})
|
params={'titleNo': self._comicid})
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
if ('text/html' in response.headers['content-type'] and
|
||||||
|
'available in your area' in response.text):
|
||||||
|
self.geoblocked()
|
||||||
return response.json()['result']['list'][self.lastid]['articleUrl']
|
return response.json()['result']['list'][self.lastid]['articleUrl']
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlsearch):
|
def fetchUrls(self, url, data, urlsearch):
|
||||||
|
|
|
@ -32,6 +32,11 @@ from .xml import NS
|
||||||
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
|
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
|
||||||
|
|
||||||
|
|
||||||
|
class GeoblockedException(IOError):
    """Error signalling that the current location seems to be geo-blocked.

    Takes no arguments; the message is fixed.
    """

    def __init__(self):
        # Plain string literal: the message has no placeholders, so an
        # f-string prefix would be redundant.
        super().__init__('It seems your current location is geo-blocked.')
|
||||||
|
|
||||||
|
|
||||||
class Scraper(object):
|
class Scraper(object):
|
||||||
'''Base class for all comic scraper, but without a specific scrape
|
'''Base class for all comic scraper, but without a specific scrape
|
||||||
implementation.'''
|
implementation.'''
|
||||||
|
@ -346,6 +351,10 @@ class Scraper(object):
|
||||||
pass
|
pass
|
||||||
return lang
|
return lang
|
||||||
|
|
||||||
|
def geoblocked(self):
    """Signal a probable geo-block by raising GeoblockedException.

    Intended to be called by scrapers once they recognise a response
    that indicates the user's location is blocked.
    """
    raise GeoblockedException
|
||||||
|
|
||||||
|
|
||||||
class _BasicScraper(Scraper):
|
class _BasicScraper(Scraper):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -7,7 +7,9 @@ import responses
|
||||||
|
|
||||||
import dosagelib.cmd
|
import dosagelib.cmd
|
||||||
import httpmocks
|
import httpmocks
|
||||||
|
from dosagelib.plugins.s import SoloLeveling
|
||||||
|
from dosagelib.plugins.smackjeeves import SmackJeeves
|
||||||
|
from dosagelib.scraper import GeoblockedException
|
||||||
|
|
||||||
def cmd(*options):
|
def cmd(*options):
|
||||||
"""'Fake' run dosage with given options."""
|
"""'Fake' run dosage with given options."""
|
||||||
|
@ -39,3 +41,19 @@ class TestModules(object):
|
||||||
|
|
||||||
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol')
|
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol')
|
||||||
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol:2012/07/22')
|
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol:2012/07/22')
|
||||||
|
|
||||||
|
@responses.activate
def test_smackjeeves_geoblock(self, tmpdir):
    """A geo-block HTML answer from the SmackJeeves API raises GeoblockedException."""
    # Raw string with escaped dots, so only the literal host name matches
    # rather than any character in those positions.
    responses.add(responses.POST,
        re.compile(r'https://www\.smackjeeves\.com/api/.*'),
        'is not currently available in your area', content_type='text/html')

    with pytest.raises(GeoblockedException):
        next(SmackJeeves.getmodules()[0].getStrips(1))
|
||||||
|
|
||||||
|
@responses.activate
def test_sololeveling_geoblock(self, tmpdir):
    """A 403 answer containing '1020' makes SoloLeveling raise GeoblockedException."""
    # Mocked reply: HTTP 403 with a body that mentions code 1020.
    responses.add(
        method=responses.GET,
        url='https://w1.sololeveling.net/',
        body='<span>1020</span>',
        status=403,
    )

    with pytest.raises(GeoblockedException):
        strips = SoloLeveling.getmodules()[0].getStrips(1)
        next(strips)
|
||||||
|
|
Loading…
Reference in a new issue