diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py
index 91adcf8b6..30f1b82e9 100644
--- a/dosagelib/plugins/a.py
+++ b/dosagelib/plugins/a.py
@@ -245,6 +245,7 @@ class AppleGeeks(_BasicScraper):
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'((?:/comics/)?issue\d+\.jpg)'))
prevSearch = compile(r'<div class="caption">Previous Comic</div>\s*<p><a href="([^"]+)">', MULTILINE)
+ allow_errors = (404,)
help = 'Index format: n (unpadded)'
diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py
index eefa744e3..16125a6c8 100644
--- a/dosagelib/plugins/s.py
+++ b/dosagelib/plugins/s.py
@@ -174,6 +174,7 @@ class SexyLosers(_ParserScraper):
class Sharksplode(_WordPressScraper):
url = 'http://sharksplode.com/'
textSearch = '//div[@id="comic"]//img/@alt'
+ allow_errors = (403,)
class Sheldon(_BasicScraper):
@@ -324,6 +325,7 @@ class Sorcery101(_ParserScraper):
prevSearch = '//a[@rel="prev"]'
latestSearch = '//a[%s]' % xpath_class('last-webcomic-link')
starter = indirectStarter
+ allow_errors = (500,)
help = 'Index format: stripname'
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index b4ef59762..21099540b 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -79,6 +79,12 @@ class Scraper(object):
# usually the index format help
help = ''
+ # A list of HTTP error codes which should be treated as a successful
+ # request. This is a workaround for some comics which return regular
+ # pages with strange HTTP codes. By default, all HTTP errors raise
+ # exceptions.
+ allow_errors = ()
+
# HTTP session for configuration & cookies
session = requests_session()
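The new `allow_errors` attribute turns error tolerance into a one-line, declarative opt-in for plugins, as the three plugin changes above show. A minimal sketch of a hypothetical plugin using it (the class name and URL are invented for illustration):

```python
from dosagelib.scraper import _BasicScraper

class ExampleComic(_BasicScraper):
    # Hypothetical plugin: this site serves real comic pages with an
    # HTTP 404 status, so whitelist that code instead of raising.
    url = 'http://example.com/'
    allow_errors = (404,)
```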
@@ -306,7 +312,7 @@ class Scraper(object):
methods should be able to use the data if they so desire... (Affected
methods: shouldSkipUrl, imageUrlModifier)
"""
- raise ValueError("No implementation for getPage!")
+ return get_page(url, self.session, allow_errors=self.allow_errors)
def fetchUrls(self, url, data, urlsearch):
raise ValueError("No implementation for fetchUrls!")
@@ -362,7 +368,7 @@ class _BasicScraper(Scraper):
BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
def getPage(self, url):
- content = get_page(url, self.session).text
+ content = super(_BasicScraper, self).getPage(url).text
# determine base URL
baseUrl = None
match = self.BASE_SEARCH.search(content)
@@ -449,7 +455,7 @@ class _ParserScraper(Scraper):
broken_html_bugfix = False
def getPage(self, url):
- page = get_page(url, self.session)
+ page = super(_ParserScraper, self).getPage(url)
if page.encoding:
# Requests figured out the encoding, so we can deliver Unicode to
LXML. Unfortunately, LXML feels betrayed if there is still an XML
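The comment above refers to a real lxml restriction: unicode input that still carries an XML encoding declaration is rejected, which is why the surrounding code strips the declaration before parsing. A standalone sketch of the pitfall (assumed lxml behavior, not part of this patch):

```python
import lxml.html

text = u'<?xml version="1.0" encoding="UTF-8"?><html><body/></html>'
try:
    lxml.html.document_fromstring(text)
except ValueError as err:
    # lxml refuses unicode strings that declare their own encoding
    print(err)
```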
diff --git a/dosagelib/util.py b/dosagelib/util.py
index 1326ec088..35086dfe3 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -18,7 +18,9 @@ import re
import traceback
import time
import subprocess
+
from six.moves.html_parser import HTMLParser
+from six.moves import range
import six
try:
@@ -172,11 +174,11 @@ def case_insensitive_re(name):
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
-def get_page(url, session, max_content_bytes=MaxContentBytes):
+def get_page(url, session, **kwargs):
"""Get text content of given URL."""
check_robotstxt(url, session)
# read page data
- page = urlopen(url, session, max_content_bytes=max_content_bytes)
+ page = urlopen(url, session, max_content_bytes=MaxContentBytes, **kwargs)
out.debug(u"Got page content %r" % page.content, level=3)
return page
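Since `get_page` now forwards arbitrary keyword arguments to `urlopen`, callers can pass `allow_errors` straight through. A quick usage sketch (the URL is a placeholder):

```python
import requests
from dosagelib.util import get_page

session = requests.Session()
# A 404 from this (placeholder) URL comes back as a normal response
# instead of raising, because the code is whitelisted:
page = get_page('http://example.com/missing', session, allow_errors=(404,))
print(page.status_code)
```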
@@ -257,7 +259,7 @@ def get_robotstxt_parser(url, session=None):
rp = RobotFileParser()
try:
req = urlopen(url, session, max_content_bytes=MaxContentBytes,
- raise_for_status=False)
+ allow_errors=range(600))
except Exception:
# connect or timeout errors are treated as an absent robots.txt
rp.allow_all = True
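`range(600)` whitelists every HTTP status code from 0 to 599, so the status check in `urlopen` never raises here; this reproduces the old `raise_for_status=False` behavior through the new parameter:

```python
# Every realistic HTTP status falls inside range(600), so none of them
# triggers raise_for_status() for the robots.txt fetch:
print(404 in range(600))  # True
print(503 in range(600))  # True
```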
@@ -270,7 +272,7 @@ def get_robotstxt_parser(url, session=None):
def urlopen(url, session, referrer=None, max_content_bytes=None,
- raise_for_status=True, useragent=UserAgent, **kwargs):
+ allow_errors=(), useragent=UserAgent, **kwargs):
"""Open an URL and return the response object."""
out.debug(u'Open URL %s' % url)
if 'headers' not in kwargs:
@@ -291,7 +293,7 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
req = session.request(method, url, **kwargs)
out.debug(u'Response cookies: %s' % req.cookies)
check_content_size(url, req.headers, max_content_bytes)
- if raise_for_status:
+ if req.status_code not in allow_errors:
req.raise_for_status()
return req
except requests.exceptions.RequestException as err:
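The semantics of the replaced check: a status on the whitelist short-circuits `raise_for_status`, everything else behaves as before. A self-contained sketch of the same logic (the endpoint is just an example of a URL that returns 500):

```python
import requests

def check_status(req, allow_errors=()):
    # Same check as the patched urlopen(): whitelisted statuses pass
    # through, all others defer to raise_for_status().
    if req.status_code not in allow_errors:
        req.raise_for_status()
    return req

session = requests.Session()
resp = session.get('http://httpbin.org/status/500')
try:
    check_status(resp)                    # default: 500 raises HTTPError
except requests.exceptions.HTTPError as err:
    print('raised:', err)
check_status(resp, allow_errors=(500,))  # whitelisted: returned as-is
```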