Catch robots.txt errors.

Bastian Kleineidam 2013-02-21 19:48:04 +01:00
parent b453c442c2
commit d0c3492cc7


@@ -183,6 +183,7 @@ def normaliseURL(url):
     """Removes any leading empty segments to avoid breaking urllib2; also replaces
     HTML entities and character references.
     """
+    # XXX does not work for python3
     if isinstance(url, unicode):
         url = url.encode(UrlEncoding, 'ignore')
     # XXX: brutal hack
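
The XXX note added in this hunk flags a real portability problem: the unicode type and the encode-to-bytes step only exist on Python 2. A minimal sketch of a version-aware guard, assuming UrlEncoding is a codec name such as 'utf-8' as in the surrounding module; the helper name is hypothetical and not part of this commit:

    import sys

    UrlEncoding = 'utf-8'  # assumption: the module-level codec name used above

    def encode_url(url):  # hypothetical helper, for illustration only
        """Byte-encode a URL on Python 2 only.

        urllib2 (Python 2) can break on unicode URLs, while Python 3's
        urllib.request expects str, so text is left untouched there.
        The version check short-circuits, so the bare `unicode` name is
        never evaluated on Python 3.
        """
        if sys.version_info[0] < 3 and isinstance(url, unicode):  # noqa: F821
            return url.encode(UrlEncoding, 'ignore')
        return url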
@@ -221,13 +222,18 @@ def check_robotstxt(url, session):
 def get_robotstxt_parser(url, session=None):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
-    req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
-    if req.status_code in (401, 403):
-        rp.disallow_all = True
-    elif req.status_code >= 400:
-        rp.allow_all = True
-    elif req.status_code == 200:
-        rp.parse(req.content.splitlines())
+    try:
+        req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
+    except Exception:
+        # connect or timeout errors are treated as an absent robots.txt
+        rp.allow_all = True
+    else:
+        if req.status_code in (401, 403):
+            rp.disallow_all = True
+        elif req.status_code >= 400:
+            rp.allow_all = True
+        elif req.status_code == 200:
+            rp.parse(req.content.splitlines())
     return rp
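
For context, a hedged sketch of how a parser built this way is typically consulted. The standard library's RobotFileParser.can_fetch() honours the disallow_all/allow_all flags set above, so a 401/403 on robots.txt blocks every URL, while other HTTP errors and the download failures caught by this commit permit everything. The user agent string and URLs below are illustrative:

    rp = get_robotstxt_parser("http://example.com/robots.txt")
    # can_fetch() respects the flags set above: disallow_all -> always False,
    # allow_all -> always True; otherwise the parsed Disallow/Allow rules apply.
    ok = rp.can_fetch("ExampleBot/1.0", "http://example.com/comics/page.html")
    print("allowed" if ok else "forbidden")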