Allow modules to ignore some HTTP error codes.
This is necessary since some web servers out there seem to be misconfigured and deliver actual content with an HTTP error code.
parent 46b7a374f6
commit 3f9feec041

4 changed files with 19 additions and 8 deletions
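
In practice a module only needs to list the status codes its server misuses; the per-module hunks below do exactly this for AppleGeeks (404), Sharksplode (403) and Sorcery101 (500). A minimal sketch of such a module (class name, URL and patterns are illustrative; the imports assume the usual layout of dosage's plugin modules):

    from re import compile

    from ..scraper import _BasicScraper
    from ..util import tagre


    class SomeComic(_BasicScraper):
        # Hypothetical module: the server answers existing strips with a 404
        # status, so treat that status as a successful response.
        url = 'http://example.com/'
        stripUrl = url + 'comic/%s'
        imageSearch = compile(tagre("img", "src", r'(/comics/[^"]+)'))
        prevSearch = compile(r'<a href="([^"]+)">Previous</a>')
        allow_errors = (404,)
        help = 'Index format: n'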
@@ -245,6 +245,7 @@ class AppleGeeks(_BasicScraper):
     firstStripUrl = stripUrl % '1'
     imageSearch = compile(tagre("img", "src", r'((?:/comics/)?issue\d+\.jpg)'))
     prevSearch = compile(r'<div class="caption">Previous Comic</div>\s*<p><a href="([^"]+)">', MULTILINE)
+    allow_errors = (404,)
     help = 'Index format: n (unpadded)'
@@ -174,6 +174,7 @@ class SexyLosers(_ParserScraper):
 class Sharksplode(_WordPressScraper):
     url = 'http://sharksplode.com/'
     textSearch = '//div[@id="comic"]//img/@alt'
+    allow_errors = (403,)


 class Sheldon(_BasicScraper):
@@ -324,6 +325,7 @@ class Sorcery101(_ParserScraper):
     prevSearch = '//a[@rel="prev"]'
     latestSearch = '//a[%s]' % xpath_class('last-webcomic-link')
     starter = indirectStarter
+    allow_errors = (500,)
     help = 'Index format: stripname'
@@ -79,6 +79,12 @@ class Scraper(object):
     # usually the index format help
     help = ''

+    # A list of HTTP error codes which should be handled as successful
+    # requests. This is a workaround for some comics which return regular
+    # pages with strange HTTP codes. By default, all HTTP errors raise
+    # exceptions.
+    allow_errors = ()
+
     # HTTP session for configuration & cookies
     session = requests_session()
@@ -306,7 +312,7 @@ class Scraper(object):
         methods should be able to use the data if they so desire... (Affected
         methods: shouldSkipUrl, imageUrlModifier)
         """
-        raise ValueError("No implementation for getPage!")
+        return get_page(url, self.session, allow_errors=self.allow_errors)

     def fetchUrls(self, url, data, urlsearch):
         raise ValueError("No implementation for fetchUrls!")
@@ -362,7 +368,7 @@ class _BasicScraper(Scraper):
     BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))

     def getPage(self, url):
-        content = get_page(url, self.session).text
+        content = super(_BasicScraper, self).getPage(url).text
         # determine base URL
         baseUrl = None
         match = self.BASE_SEARCH.search(content)
@@ -449,7 +455,7 @@ class _ParserScraper(Scraper):
     broken_html_bugfix = False

     def getPage(self, url):
-        page = get_page(url, self.session)
+        page = super(_ParserScraper, self).getPage(url)
         if page.encoding:
             # Requests figured out the encoding, so we can deliver Unicode to
             # LXML. Unfortunately, LXML feels betrayed if there is still an XML
@@ -18,7 +18,9 @@ import re
 import traceback
 import time
 import subprocess

 from six.moves.html_parser import HTMLParser
+from six.moves import range
 import six

 try:
@@ -172,11 +174,11 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)


-def get_page(url, session, max_content_bytes=MaxContentBytes):
+def get_page(url, session, **kwargs):
     """Get text content of given URL."""
     check_robotstxt(url, session)
     # read page data
-    page = urlopen(url, session, max_content_bytes=max_content_bytes)
+    page = urlopen(url, session, max_content_bytes=MaxContentBytes, **kwargs)
     out.debug(u"Got page content %r" % page.content, level=3)
     return page
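
With get_page now passing extra keyword arguments through to urlopen, a caller can forward the whitelist directly. A usage sketch (assuming get_page is importable from dosagelib.util; the URL is illustrative):

    import requests

    from dosagelib.util import get_page

    session = requests.Session()
    # A 404 answer is returned like a normal page instead of raising.
    page = get_page('http://example.com/strip/1', session, allow_errors=(404,))
    print(page.status_code, len(page.text))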
@@ -257,7 +259,7 @@ def get_robotstxt_parser(url, session=None):
     rp = RobotFileParser()
     try:
         req = urlopen(url, session, max_content_bytes=MaxContentBytes,
-                      raise_for_status=False)
+                      allow_errors=range(600))
     except Exception:
         # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
@@ -270,7 +272,7 @@ def get_robotstxt_parser(url, session=None):


 def urlopen(url, session, referrer=None, max_content_bytes=None,
-            raise_for_status=True, useragent=UserAgent, **kwargs):
+            allow_errors=(), useragent=UserAgent, **kwargs):
     """Open an URL and return the response object."""
     out.debug(u'Open URL %s' % url)
     if 'headers' not in kwargs:
@@ -291,7 +293,7 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
         req = session.request(method, url, **kwargs)
         out.debug(u'Response cookies: %s' % req.cookies)
         check_content_size(url, req.headers, max_content_bytes)
-        if raise_for_status:
+        if req.status_code not in allow_errors:
             req.raise_for_status()
         return req
     except requests.exceptions.RequestException as err:
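
The feature ultimately reduces to the status check above: urlopen only raises when the response code is not whitelisted. A standalone illustration of that behaviour using plain requests (not dosage's API; function name and URL are illustrative):

    import requests

    def fetch(url, allow_errors=()):
        """Return the response; raise only for HTTP errors outside the whitelist."""
        resp = requests.get(url)
        if resp.status_code not in allow_errors:
            resp.raise_for_status()  # 4xx/5xx codes not whitelisted still raise
        return resp

    # A misconfigured server that answers a real page with 404 stays usable:
    # page = fetch('http://example.com/broken-but-working', allow_errors=(404,))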