Allow modules to ignore some HTTP error codes.

This is necessary since some webservers out there seem to be
misconfigured to deliver actual content with an HTTP error code...
Tobias Gruetzmacher 2016-11-01 18:25:02 +01:00
parent 46b7a374f6
commit 3f9feec041
4 changed files with 19 additions and 8 deletions
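
In practice a comic module opts in by listing the status codes it expects on its scraper class, as the plugin hunks below show. For illustration only, a hypothetical module could look like this (the class name, URL and search expressions are made up; only allow_errors and the _ParserScraper base class come from dosage):

    # Hypothetical plugin sketch -- not part of this commit.
    from ..scraper import _ParserScraper

    class SomeBrokenComic(_ParserScraper):
        url = 'http://broken-comic.example.com/'
        imageSearch = '//div[@id="comic"]//img/@src'
        prevSearch = '//a[@rel="prev"]'
        # The server answers with HTTP 404 even for pages that contain a
        # strip, so treat 404 as a successful fetch instead of raising.
        allow_errors = (404,)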


@@ -245,6 +245,7 @@ class AppleGeeks(_BasicScraper):
     firstStripUrl = stripUrl % '1'
     imageSearch = compile(tagre("img", "src", r'((?:/comics/)?issue\d+\.jpg)'))
     prevSearch = compile(r'<div class="caption">Previous Comic</div>\s*<p><a href="([^"]+)">', MULTILINE)
+    allow_errors = (404,)
     help = 'Index format: n (unpadded)'


@@ -174,6 +174,7 @@ class SexyLosers(_ParserScraper):
 class Sharksplode(_WordPressScraper):
     url = 'http://sharksplode.com/'
     textSearch = '//div[@id="comic"]//img/@alt'
+    allow_errors = (403,)


 class Sheldon(_BasicScraper):
@@ -324,6 +325,7 @@ class Sorcery101(_ParserScraper):
     prevSearch = '//a[@rel="prev"]'
     latestSearch = '//a[%s]' % xpath_class('last-webcomic-link')
     starter = indirectStarter
+    allow_errors = (500,)
     help = 'Index format: stripname'


@@ -79,6 +79,12 @@ class Scraper(object):
     # usually the index format help
     help = ''
+    # Specifying a list of HTTP error codes which should be handled as a
+    # successful request. This is a workaround for some comics which return
+    # regular pages with strange HTTP codes. By default, all HTTP errors raise
+    # exceptions.
+    allow_errors = ()
+
     # HTTP session for configuration & cookies
     session = requests_session()
@@ -306,7 +312,7 @@ class Scraper(object):
         methods should be able to use the data if they so desire... (Affected
         methods: shouldSkipUrl, imageUrlModifier)
         """
-        raise ValueError("No implementation for getPage!")
+        return get_page(url, self.session, allow_errors=self.allow_errors)

     def fetchUrls(self, url, data, urlsearch):
         raise ValueError("No implementation for fetchUrls!")
@@ -362,7 +368,7 @@ class _BasicScraper(Scraper):
     BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))

     def getPage(self, url):
-        content = get_page(url, self.session).text
+        content = super(_BasicScraper, self).getPage(url).text
         # determine base URL
         baseUrl = None
         match = self.BASE_SEARCH.search(content)
@@ -449,7 +455,7 @@ class _ParserScraper(Scraper):
     broken_html_bugfix = False

     def getPage(self, url):
-        page = get_page(url, self.session)
+        page = super(_ParserScraper, self).getPage(url)
         if page.encoding:
             # Requests figured out the encoding, so we can deliver Unicode to
             # LXML. Unfortunatly, LXML feels betrayed if there is still an XML
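
The net effect of the Scraper changes above is that every subclass now funnels its fetch through the base getPage, which is the only place that consults allow_errors. A condensed sketch of that pattern, with simplified names and plain requests standing in for dosage's get_page helper:

    # Illustrative sketch, not the actual dosage classes.
    import requests

    class Base(object):
        # Status codes listed here are treated as a successful fetch.
        allow_errors = ()

        def getPage(self, url):
            resp = requests.get(url, timeout=60)
            if resp.status_code not in self.allow_errors:
                resp.raise_for_status()  # anything not whitelisted still raises
            return resp

    class HtmlScraper(Base):
        # The site serves real pages with HTTP 500, so accept that code.
        allow_errors = (500,)

        def getPage(self, url):
            # Reuse the shared fetch, then work with the decoded text.
            return super(HtmlScraper, self).getPage(url).text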


@@ -18,7 +18,9 @@ import re
 import traceback
 import time
 import subprocess
 from six.moves.html_parser import HTMLParser
+from six.moves import range
 import six

 try:
@@ -172,11 +174,11 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)


-def get_page(url, session, max_content_bytes=MaxContentBytes):
+def get_page(url, session, **kwargs):
     """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
-    page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    page = urlopen(url, session, max_content_bytes=MaxContentBytes, **kwargs)
     out.debug(u"Got page content %r" % page.content, level=3)
     return page
@@ -257,7 +259,7 @@ def get_robotstxt_parser(url, session=None):
     rp = RobotFileParser()
     try:
         req = urlopen(url, session, max_content_bytes=MaxContentBytes,
-                      raise_for_status=False)
+                      allow_errors=range(600))
     except Exception:
         # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
@@ -270,7 +272,7 @@ def get_robotstxt_parser(url, session=None):
 def urlopen(url, session, referrer=None, max_content_bytes=None,
-            raise_for_status=True, useragent=UserAgent, **kwargs):
+            allow_errors=(), useragent=UserAgent, **kwargs):
     """Open an URL and return the response object."""
     out.debug(u'Open URL %s' % url)
     if 'headers' not in kwargs:
@@ -291,7 +293,7 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
         req = session.request(method, url, **kwargs)
         out.debug(u'Response cookies: %s' % req.cookies)
         check_content_size(url, req.headers, max_content_bytes)
-        if raise_for_status:
+        if req.status_code not in allow_errors:
             req.raise_for_status()
         return req
     except requests.exceptions.RequestException as err: