Allow modules to ignore some HTTP error codes.

This is necessary since some web servers out there appear to be
misconfigured and deliver actual content with an HTTP error code...
Tobias Gruetzmacher 2016-11-01 18:25:02 +01:00
parent 46b7a374f6
commit 3f9feec041
4 changed files with 19 additions and 8 deletions
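
The idea in a nutshell, as a standalone sketch (plain requests, not dosage code; the URL, function name and whitelist below are made up): only status codes outside the whitelist still raise an exception.

    import requests

    def fetch(url, allow_errors=(), timeout=60):
        """Fetch url, treating whitelisted HTTP error codes as success."""
        resp = requests.get(url, timeout=timeout)
        if resp.status_code not in allow_errors:
            resp.raise_for_status()  # non-whitelisted 4xx/5xx still raise
        return resp

    # A server that delivers real comic pages with a 404 status would be read as:
    # page = fetch('http://example.com/comics/1', allow_errors=(404,))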

@@ -245,6 +245,7 @@ class AppleGeeks(_BasicScraper):
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'((?:/comics/)?issue\d+\.jpg)'))
prevSearch = compile(r'<div class="caption">Previous Comic</div>\s*<p><a href="([^"]+)">', MULTILINE)
allow_errors = (404,)
help = 'Index format: n (unpadded)'

@@ -174,6 +174,7 @@ class SexyLosers(_ParserScraper):
class Sharksplode(_WordPressScraper):
url = 'http://sharksplode.com/'
textSearch = '//div[@id="comic"]//img/@alt'
allow_errors = (403,)
class Sheldon(_BasicScraper):
@@ -324,6 +325,7 @@ class Sorcery101(_ParserScraper):
prevSearch = '//a[@rel="prev"]'
latestSearch = '//a[%s]' % xpath_class('last-webcomic-link')
starter = indirectStarter
allow_errors = (500,)
help = 'Index format: stripname'

@@ -79,6 +79,12 @@ class Scraper(object):
# usually the index format help
help = ''
# A list of HTTP error codes which should be handled as a successful
# request. This is a workaround for some comics which return regular
# pages with strange HTTP codes. By default, all HTTP errors raise
# exceptions.
allow_errors = ()
# HTTP session for configuration & cookies
session = requests_session()
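
To illustrate the semantics of the new class attribute (a minimal sketch; the comic class below is made up): the default empty tuple keeps the strict behaviour, and a module lists exactly the codes its server misuses.

    class Scraper(object):
        # default: no whitelisted codes, every HTTP error still raises
        allow_errors = ()

    class SomeComic(Scraper):
        # hypothetical module whose server delivers real pages with a 404
        allow_errors = (404,)
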
@@ -306,7 +312,7 @@ class Scraper(object):
methods should be able to use the data if they so desire... (Affected
methods: shouldSkipUrl, imageUrlModifier)
"""
raise ValueError("No implementation for getPage!")
return get_page(url, self.session, allow_errors=self.allow_errors)
def fetchUrls(self, url, data, urlsearch):
raise ValueError("No implementation for fetchUrls!")
@@ -362,7 +368,7 @@ class _BasicScraper(Scraper):
BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
def getPage(self, url):
content = get_page(url, self.session).text
content = super(_BasicScraper, self).getPage(url).text
# determine base URL
baseUrl = None
match = self.BASE_SEARCH.search(content)
@@ -449,7 +455,7 @@ class _ParserScraper(Scraper):
broken_html_bugfix = False
def getPage(self, url):
page = get_page(url, self.session)
page = super(_ParserScraper, self).getPage(url)
if page.encoding:
# Requests figured out the encoding, so we can deliver Unicode to
LXML. Unfortunately, LXML feels betrayed if there is still an XML

@@ -18,7 +18,9 @@ import re
import traceback
import time
import subprocess
from six.moves.html_parser import HTMLParser
from six.moves import range
import six
try:
@@ -172,11 +174,11 @@ def case_insensitive_re(name):
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
def get_page(url, session, max_content_bytes=MaxContentBytes):
def get_page(url, session, **kwargs):
"""Get text content of given URL."""
check_robotstxt(url, session)
# read page data
page = urlopen(url, session, max_content_bytes=max_content_bytes)
page = urlopen(url, session, max_content_bytes=MaxContentBytes, **kwargs)
out.debug(u"Got page content %r" % page.content, level=3)
return page
@@ -257,7 +259,7 @@ def get_robotstxt_parser(url, session=None):
rp = RobotFileParser()
try:
req = urlopen(url, session, max_content_bytes=MaxContentBytes,
raise_for_status=False)
allow_errors=range(600))
except Exception:
# connect or timeout errors are treated as an absent robots.txt
rp.allow_all = True
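
Since every HTTP status code is below 600, allow_errors=range(600) whitelists all of them, which reproduces the old raise_for_status=False behaviour for the robots.txt fetch:

    # all real HTTP status codes (100-599) fall inside range(600)
    assert all(code in range(600) for code in (200, 301, 404, 503))
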
@@ -270,7 +272,7 @@ def get_robotstxt_parser(url, session=None):
def urlopen(url, session, referrer=None, max_content_bytes=None,
raise_for_status=True, useragent=UserAgent, **kwargs):
allow_errors=(), useragent=UserAgent, **kwargs):
"""Open an URL and return the response object."""
out.debug(u'Open URL %s' % url)
if 'headers' not in kwargs:
@@ -291,7 +293,7 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
req = session.request(method, url, **kwargs)
out.debug(u'Response cookies: %s' % req.cookies)
check_content_size(url, req.headers, max_content_bytes)
if raise_for_status:
if req.status_code not in allow_errors:
req.raise_for_status()
return req
except requests.exceptions.RequestException as err:
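
The effect of the changed condition, demonstrated offline with a hand-built 404 response (a rough sketch, not a test from the repository): with the default empty whitelist the HTTPError is still raised, with 404 whitelisted the response passes through.

    import requests

    resp = requests.Response()
    resp.status_code = 404

    for allow_errors in ((), (404,)):
        try:
            # mirrors the new check in urlopen()
            if resp.status_code not in allow_errors:
                resp.raise_for_status()
            print(allow_errors, '-> treated as success')
        except requests.exceptions.HTTPError as err:
            print(allow_errors, '->', err)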