Allow modules to ignore some HTTP error codes.
This is necessary because some misconfigured webservers deliver actual page content together with an HTTP error status code.
Commit 3f9feec041 (parent 46b7a374f6)
4 changed files with 19 additions and 8 deletions
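For illustration, a minimal sketch of the kind of response this works around (the URL is a hypothetical example, not part of the change): the server answers with an error status even though the body is a perfectly usable page.

import requests

resp = requests.get('http://example.com/comic/1')  # hypothetical comic URL
print(resp.status_code)   # e.g. 404, even though the strip is there
print(len(resp.content))  # non-empty HTML the scraper still wants to parse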
@@ -245,6 +245,7 @@ class AppleGeeks(_BasicScraper):
     firstStripUrl = stripUrl % '1'
     imageSearch = compile(tagre("img", "src", r'((?:/comics/)?issue\d+\.jpg)'))
     prevSearch = compile(r'<div class="caption">Previous Comic</div>\s*<p><a href="([^"]+)">', MULTILINE)
+    allow_errors = (404,)
     help = 'Index format: n (unpadded)'
@@ -174,6 +174,7 @@ class SexyLosers(_ParserScraper):
 class Sharksplode(_WordPressScraper):
     url = 'http://sharksplode.com/'
     textSearch = '//div[@id="comic"]//img/@alt'
+    allow_errors = (403,)


 class Sheldon(_BasicScraper):
@@ -324,6 +325,7 @@ class Sorcery101(_ParserScraper):
     prevSearch = '//a[@rel="prev"]'
     latestSearch = '//a[%s]' % xpath_class('last-webcomic-link')
     starter = indirectStarter
+    allow_errors = (500,)
     help = 'Index format: stripname'
@@ -79,6 +79,12 @@ class Scraper(object):
     # usually the index format help
     help = ''

+    # Specifying a list of HTTP error codes which should be handled as a
+    # successful request. This is a workaround for some comics which return
+    # regular pages with strange HTTP codes. By default, all HTTP errors raise
+    # exceptions.
+    allow_errors = ()
+
     # HTTP session for configuration & cookies
     session = requests_session()
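As a usage sketch (the module name, URLs and codes below are hypothetical; the pattern mirrors the plugin changes above): a scraper module opts in by listing the status codes its site is known to misuse.

class ExampleComic(_BasicScraper):   # hypothetical module
    url = 'http://example.com/'
    stripUrl = url + 'comic/%s'
    # The site is assumed to serve real strips with a 403 or 404 status,
    # so those codes are treated as a successful fetch instead of aborting.
    allow_errors = (403, 404)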
@@ -306,7 +312,7 @@ class Scraper(object):
         methods should be able to use the data if they so desire... (Affected
         methods: shouldSkipUrl, imageUrlModifier)
         """
-        raise ValueError("No implementation for getPage!")
+        return get_page(url, self.session, allow_errors=self.allow_errors)

     def fetchUrls(self, url, data, urlsearch):
         raise ValueError("No implementation for fetchUrls!")
@@ -362,7 +368,7 @@ class _BasicScraper(Scraper):
     BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))

     def getPage(self, url):
-        content = get_page(url, self.session).text
+        content = super(_BasicScraper, self).getPage(url).text
         # determine base URL
         baseUrl = None
         match = self.BASE_SEARCH.search(content)
@@ -449,7 +455,7 @@ class _ParserScraper(Scraper):
     broken_html_bugfix = False

     def getPage(self, url):
-        page = get_page(url, self.session)
+        page = super(_ParserScraper, self).getPage(url)
         if page.encoding:
             # Requests figured out the encoding, so we can deliver Unicode to
             # LXML. Unfortunately, LXML feels betrayed if there is still an XML
@@ -18,7 +18,9 @@ import re
 import traceback
 import time
 import subprocess

 from six.moves.html_parser import HTMLParser
+from six.moves import range
 import six

 try:
@@ -172,11 +174,11 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)


-def get_page(url, session, max_content_bytes=MaxContentBytes):
+def get_page(url, session, **kwargs):
     """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
-    page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    page = urlopen(url, session, max_content_bytes=MaxContentBytes, **kwargs)
     out.debug(u"Got page content %r" % page.content, level=3)
     return page
@@ -257,7 +259,7 @@ def get_robotstxt_parser(url, session=None):
     rp = RobotFileParser()
     try:
         req = urlopen(url, session, max_content_bytes=MaxContentBytes,
-                      raise_for_status=False)
+                      allow_errors=range(600))
     except Exception:
         # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
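A brief reading of the robots.txt change (an interpretation, not part of the diff): allow_errors=range(600) whitelists every plausible HTTP status, so fetching robots.txt never raises for status, which is what raise_for_status=False used to achieve; the six.moves.range import added above keeps this lazy on Python 2.

# Membership check as urlopen will perform it (illustrative only):
allow_errors = range(600)
assert 404 in allow_errors and 503 in allow_errors   # all error codes tolerated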
@@ -270,7 +272,7 @@ def get_robotstxt_parser(url, session=None):


 def urlopen(url, session, referrer=None, max_content_bytes=None,
-            raise_for_status=True, useragent=UserAgent, **kwargs):
+            allow_errors=(), useragent=UserAgent, **kwargs):
     """Open an URL and return the response object."""
     out.debug(u'Open URL %s' % url)
     if 'headers' not in kwargs:
@@ -291,7 +293,7 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
         req = session.request(method, url, **kwargs)
         out.debug(u'Response cookies: %s' % req.cookies)
         check_content_size(url, req.headers, max_content_bytes)
-        if raise_for_status:
+        if req.status_code not in allow_errors:
             req.raise_for_status()
         return req
     except requests.exceptions.RequestException as err:
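Putting the pieces together, a self-contained sketch of the mechanism this commit introduces (the function and variable names here are assumptions for illustration, not dosage APIs): raise only for status codes that are not explicitly allowed.

import requests

def fetch(url, session=None, allow_errors=()):
    """Return the response; raise only for status codes not in allow_errors."""
    session = session or requests.Session()
    resp = session.get(url)
    if resp.status_code not in allow_errors:
        resp.raise_for_status()   # unexpected errors still abort the download
    return resp

# A misconfigured site that serves real pages with a 404 status:
# page = fetch('http://example.com/comic/1', allow_errors=(404,))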