Allow modules to ignore some HTTP error codes.

This is necessary since some web servers out there appear to be
misconfigured and deliver actual content with an HTTP error code...
Tobias Gruetzmacher 2016-11-01 18:25:02 +01:00
parent 46b7a374f6
commit 3f9feec041
4 changed files with 19 additions and 8 deletions
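
The idea in a nutshell, as a standalone sketch (plain requests, not dosage code; the URL, function name and whitelist below are made up): only status codes outside the whitelist still raise an exception.

    import requests

    def fetch(url, allow_errors=(), timeout=60):
        """Fetch url, treating whitelisted HTTP error codes as success."""
        resp = requests.get(url, timeout=timeout)
        if resp.status_code not in allow_errors:
            resp.raise_for_status()  # non-whitelisted 4xx/5xx still raise
        return resp

    # A server that delivers real comic pages with a 404 status would be read as:
    # page = fetch('http://example.com/comics/1', allow_errors=(404,))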

@@ -245,6 +245,7 @@ class AppleGeeks(_BasicScraper):
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'((?:/comics/)?issue\d+\.jpg)'))
prevSearch = compile(r'<div class="caption">Previous Comic</div>\s*<p><a href="([^"]+)">', MULTILINE)
allow_errors = (404,)
help = 'Index format: n (unpadded)'

@@ -174,6 +174,7 @@ class SexyLosers(_ParserScraper):
class Sharksplode(_WordPressScraper):
url = 'http://sharksplode.com/'
textSearch = '//div[@id="comic"]//img/@alt'
allow_errors = (403,)
class Sheldon(_BasicScraper):
@@ -324,6 +325,7 @@ class Sorcery101(_ParserScraper):
prevSearch = '//a[@rel="prev"]'
latestSearch = '//a[%s]' % xpath_class('last-webcomic-link')
starter = indirectStarter
allow_errors = (500,)
help = 'Index format: stripname'

@@ -79,6 +79,12 @@ class Scraper(object):
# usually the index format help
help = ''
# A list of HTTP error codes which should be handled as a successful
# request. This is a workaround for some comics which return regular
# pages with strange HTTP codes. By default, all HTTP errors raise
# exceptions.
allow_errors = ()
# HTTP session for configuration & cookies
session = requests_session()
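
To illustrate the semantics of the new class attribute (a minimal sketch; the comic class below is made up): the default empty tuple keeps the strict behaviour, and a module lists exactly the codes its server misuses.

    class Scraper(object):
        # default: no whitelisted codes, every HTTP error still raises
        allow_errors = ()

    class SomeComic(Scraper):
        # hypothetical module whose server delivers real pages with a 404
        allow_errors = (404,)
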
@@ -306,7 +312,7 @@ class Scraper(object):
methods should be able to use the data if they so desire... (Affected
methods: shouldSkipUrl, imageUrlModifier)
"""
raise ValueError("No implementation for getPage!")
return get_page(url, self.session, allow_errors=self.allow_errors)
def fetchUrls(self, url, data, urlsearch):
raise ValueError("No implementation for fetchUrls!")
@@ -362,7 +368,7 @@ class _BasicScraper(Scraper):
BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
def getPage(self, url):
content = get_page(url, self.session).text
content = super(_BasicScraper, self).getPage(url).text
# determine base URL
baseUrl = None
match = self.BASE_SEARCH.search(content)
@@ -449,7 +455,7 @@ class _ParserScraper(Scraper):
broken_html_bugfix = False
def getPage(self, url):
page = get_page(url, self.session)
page = super(_ParserScraper, self).getPage(url)
if page.encoding:
# Requests figured out the encoding, so we can deliver Unicode to
LXML. Unfortunately, LXML feels betrayed if there is still an XML

@@ -18,7 +18,9 @@ import re
import traceback
import time
import subprocess
from six.moves.html_parser import HTMLParser
from six.moves import range
import six
try:
@@ -172,11 +174,11 @@ def case_insensitive_re(name):
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
def get_page(url, session, max_content_bytes=MaxContentBytes):
def get_page(url, session, **kwargs):
"""Get text content of given URL."""
check_robotstxt(url, session)
# read page data
page = urlopen(url, session, max_content_bytes=max_content_bytes)
page = urlopen(url, session, max_content_bytes=MaxContentBytes, **kwargs)
out.debug(u"Got page content %r" % page.content, level=3)
return page
@@ -257,7 +259,7 @@ def get_robotstxt_parser(url, session=None):
rp = RobotFileParser()
try:
req = urlopen(url, session, max_content_bytes=MaxContentBytes,
raise_for_status=False)
allow_errors=range(600))
except Exception:
# connect or timeout errors are treated as an absent robots.txt
rp.allow_all = True
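
Since every HTTP status code is below 600, allow_errors=range(600) whitelists all of them, which reproduces the old raise_for_status=False behaviour for the robots.txt fetch:

    # all real HTTP status codes (100-599) fall inside range(600)
    assert all(code in range(600) for code in (200, 301, 404, 503))
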
@@ -270,7 +272,7 @@ def get_robotstxt_parser(url, session=None):
def urlopen(url, session, referrer=None, max_content_bytes=None,
raise_for_status=True, useragent=UserAgent, **kwargs):
allow_errors=(), useragent=UserAgent, **kwargs):
"""Open an URL and return the response object."""
out.debug(u'Open URL %s' % url)
if 'headers' not in kwargs:
@@ -291,7 +293,7 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
req = session.request(method, url, **kwargs)
out.debug(u'Response cookies: %s' % req.cookies)
check_content_size(url, req.headers, max_content_bytes)
if raise_for_status:
if req.status_code not in allow_errors:
req.raise_for_status()
return req
except requests.exceptions.RequestException as err:
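
The effect of the changed condition, demonstrated offline with a hand-built 404 response (a rough sketch, not a test from the repository): with the default empty whitelist the HTTPError is still raised, with 404 whitelisted the response passes through.

    import requests

    resp = requests.Response()
    resp.status_code = 404

    for allow_errors in ((), (404,)):
        try:
            # mirrors the new check in urlopen()
            if resp.status_code not in allow_errors:
                resp.raise_for_status()
            print(allow_errors, '-> treated as success')
        except requests.exceptions.HTTPError as err:
            print(allow_errors, '->', err)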