diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py
index 91adcf8b6..30f1b82e9 100644
--- a/dosagelib/plugins/a.py
+++ b/dosagelib/plugins/a.py
@@ -245,6 +245,7 @@ class AppleGeeks(_BasicScraper):
     firstStripUrl = stripUrl % '1'
     imageSearch = compile(tagre("img", "src", r'((?:/comics/)?issue\d+\.jpg)'))
     prevSearch = compile(r'<div class="caption">Previous Comic</div>\s*<p><a href="([^"]+)">', MULTILINE)
+    allow_errors = (404,)
     help = 'Index format: n (unpadded)'
diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py
index eefa744e3..16125a6c8 100644
--- a/dosagelib/plugins/s.py
+++ b/dosagelib/plugins/s.py
@@ -174,6 +174,7 @@ class SexyLosers(_ParserScraper):
 class Sharksplode(_WordPressScraper):
     url = 'http://sharksplode.com/'
     textSearch = '//div[@id="comic"]//img/@alt'
+    allow_errors = (403,)
 
 
 class Sheldon(_BasicScraper):
@@ -324,6 +325,7 @@ class Sorcery101(_ParserScraper):
     prevSearch = '//a[@rel="prev"]'
     latestSearch = '//a[%s]' % xpath_class('last-webcomic-link')
     starter = indirectStarter
+    allow_errors = (500,)
     help = 'Index format: stripname'
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index b4ef59762..21099540b 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -79,6 +79,12 @@ class Scraper(object):
     # usually the index format help
     help = ''
 
+    # Specify a list of HTTP error codes which should be handled as a
+    # successful request. This is a workaround for some comics which return
+    # regular pages with strange HTTP codes. By default, all HTTP errors raise
+    # exceptions.
+    allow_errors = ()
+
     # HTTP session for configuration & cookies
     session = requests_session()
@@ -306,7 +312,7 @@ class Scraper(object):
         methods should be able to use the data if they so desire... (Affected
         methods: shouldSkipUrl, imageUrlModifier)
         """
-        raise ValueError("No implementation for getPage!")
+        return get_page(url, self.session, allow_errors=self.allow_errors)
 
     def fetchUrls(self, url, data, urlsearch):
         raise ValueError("No implementation for fetchUrls!")
@@ -362,7 +368,7 @@ class _BasicScraper(Scraper):
     BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
 
     def getPage(self, url):
-        content = get_page(url, self.session).text
+        content = super(_BasicScraper, self).getPage(url).text
         # determine base URL
         baseUrl = None
         match = self.BASE_SEARCH.search(content)
@@ -449,7 +455,7 @@ class _ParserScraper(Scraper):
     broken_html_bugfix = False
 
     def getPage(self, url):
-        page = get_page(url, self.session)
+        page = super(_ParserScraper, self).getPage(url)
         if page.encoding:
             # Requests figured out the encoding, so we can deliver Unicode to
             # LXML. Unfortunatly, LXML feels betrayed if there is still an XML
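For plugin authors the new attribute is declarative: a scraper lists the status codes its site is known to misuse, and the shared Scraper.getPage() forwards them to get_page(). A minimal sketch of a plugin using it (the class name, URL, and search expressions below are hypothetical, modeled on the Sharksplode and Sorcery101 entries above):

```python
from dosagelib.scraper import _ParserScraper


class HypotheticalComic(_ParserScraper):
    # Hypothetical example: assume this site serves valid comic pages
    # with an HTTP 404 status code instead of 200.
    url = 'http://comic.example.com/'
    imageSearch = '//div[@id="comic"]//img'
    prevSearch = '//a[@rel="prev"]'
    # Treat 404 responses as successful instead of raising HTTPError.
    allow_errors = (404,)
```

Since _BasicScraper.getPage() and _ParserScraper.getPage() now delegate to the base implementation via super(), the whitelist is honored regardless of which parsing backend a plugin builds on.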
diff --git a/dosagelib/util.py b/dosagelib/util.py
index 1326ec088..35086dfe3 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -18,7 +18,9 @@ import re
 import traceback
 import time
 import subprocess
+
 from six.moves.html_parser import HTMLParser
+from six.moves import range
 import six
 
 try:
@@ -172,11 +174,11 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
 
 
-def get_page(url, session, max_content_bytes=MaxContentBytes):
+def get_page(url, session, **kwargs):
     """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
-    page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    page = urlopen(url, session, max_content_bytes=MaxContentBytes, **kwargs)
     out.debug(u"Got page content %r" % page.content, level=3)
     return page
@@ -257,7 +259,7 @@ def get_robotstxt_parser(url, session=None):
     rp = RobotFileParser()
     try:
         req = urlopen(url, session, max_content_bytes=MaxContentBytes,
-                      raise_for_status=False)
+                      allow_errors=range(600))
     except Exception:
         # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
@@ -270,7 +272,7 @@
 
 
 def urlopen(url, session, referrer=None, max_content_bytes=None,
-            raise_for_status=True, useragent=UserAgent, **kwargs):
+            allow_errors=(), useragent=UserAgent, **kwargs):
     """Open an URL and return the response object."""
     out.debug(u'Open URL %s' % url)
     if 'headers' not in kwargs:
@@ -291,7 +293,7 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
         req = session.request(method, url, **kwargs)
         out.debug(u'Response cookies: %s' % req.cookies)
         check_content_size(url, req.headers, max_content_bytes)
-        if raise_for_status:
+        if req.status_code not in allow_errors:
             req.raise_for_status()
         return req
     except requests.exceptions.RequestException as err:
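In urlopen(), the allow_errors whitelist subsumes the old boolean raise_for_status flag: the default empty tuple keeps the strict behaviour, while allow_errors=range(600) reproduces raise_for_status=False for the robots.txt fetch by whitelisting every possible status code (the added six.moves import keeps range a lazy object on Python 2 as well). A self-contained sketch of the new check, written against requests directly rather than dosage's helpers:

```python
import requests


def fetch(url, allow_errors=()):
    """Fetch url, raising only for HTTP error codes not in allow_errors."""
    req = requests.get(url)
    # allow_errors=() -> any 4xx/5xx raises requests.exceptions.HTTPError,
    # equivalent to the old raise_for_status=True default.
    # allow_errors=range(600) -> no status code ever raises, equivalent to
    # the old raise_for_status=False used when fetching robots.txt.
    if req.status_code not in allow_errors:
        req.raise_for_status()
    return req


# e.g. accept a comic page that is served with HTTP 404:
# page = fetch('http://comic.example.com/strip/42', allow_errors=(404,))
```

Note one side effect: get_page() now always passes max_content_bytes=MaxContentBytes to urlopen(), so the size limit can no longer be overridden through get_page(); its **kwargs are reserved for allow_errors and other urlopen() options.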