From 7e040086b69c1156c8175d6eb43d7429c079da41 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Mon, 28 Sep 2020 13:11:34 +0200 Subject: [PATCH] Try to inform the user about geo-blocks Instead of letting the crawler run into "random" error messages, throw a specific "geoblocked" exception. --- dosagelib/plugins/s.py | 12 ++++++++++++ dosagelib/plugins/smackjeeves.py | 3 +++ dosagelib/scraper.py | 9 +++++++++ tests/test_modules.py | 20 +++++++++++++++++++- 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py index 548a87858..e4d003426 100644 --- a/dosagelib/plugins/s.py +++ b/dosagelib/plugins/s.py @@ -6,6 +6,8 @@ from re import compile, escape, IGNORECASE, sub from os.path import splitext +from requests.exceptions import HTTPError + from ..scraper import _BasicScraper, _ParserScraper from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer from ..util import tagre @@ -385,6 +387,16 @@ class SoloLeveling(_ParserScraper): self.imageUrls = [self.imageUrlModifier(x, data) for x in self.imageUrls] return self.imageUrls + def getPage(self, url): + try: + return super().getPage(url) + except HTTPError as e: + # CloudFlare WAF + if e.response.status_code == 403 and '1020' in e.response.text: + self.geoblocked() + else: + raise e + def getPrevUrl(self, url, data): return self.stripUrl % str(int(url.strip('/').rsplit('-', 1)[-1]) - 1) diff --git a/dosagelib/plugins/smackjeeves.py b/dosagelib/plugins/smackjeeves.py index 78d16b176..69a13c553 100644 --- a/dosagelib/plugins/smackjeeves.py +++ b/dosagelib/plugins/smackjeeves.py @@ -29,6 +29,9 @@ class SmackJeeves(_ParserScraper): response = self.session.post(self.apiBase + 'articleList', params={'titleNo': self._comicid}) response.raise_for_status() + if ('text/html' in response.headers['content-type'] and + 'available in your area' in response.text): + self.geoblocked() return response.json()['result']['list'][self.lastid]['articleUrl'] def 
fetchUrls(self, url, data, urlsearch): diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index 33b37790c..23f1d2b5c 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -32,6 +32,11 @@ from .xml import NS ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/') +class GeoblockedException(IOError): + def __init__(self): + super().__init__(f'It seems your current location is geo-blocked.') + + class Scraper(object): '''Base class for all comic scraper, but without a specific scrape implementation.''' @@ -346,6 +351,10 @@ class Scraper(object): pass return lang + def geoblocked(self): + """Helper method to indicate that the user is most probably geo-blocked.""" + raise GeoblockedException() + class _BasicScraper(Scraper): """ diff --git a/tests/test_modules.py b/tests/test_modules.py index 0fa85f4bb..b08346231 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -7,7 +7,9 @@ import responses import dosagelib.cmd import httpmocks - +from dosagelib.plugins.s import SoloLeveling +from dosagelib.plugins.smackjeeves import SmackJeeves +from dosagelib.scraper import GeoblockedException def cmd(*options): """'Fake' run dosage with given options.""" @@ -39,3 +41,19 @@ class TestModules(object): cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol') cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol:2012/07/22') + + @responses.activate + def test_smackjeeves_geoblock(self, tmpdir): + responses.add(responses.POST, re.compile('https://www.smackjeeves.com/api/.*'), + 'is not currently available in your area', content_type='text/html') + + with pytest.raises(GeoblockedException): + next(SmackJeeves.getmodules()[0].getStrips(1)) + + @responses.activate + def test_sololeveling_geoblock(self, tmpdir): + responses.add(responses.GET, 'https://w1.sololeveling.net/', + '1020', status=403) + + with pytest.raises(GeoblockedException): + next(SoloLeveling.getmodules()[0].getStrips(1))