Catch robots.txt errors.

This commit is contained in:
Bastian Kleineidam 2013-02-21 19:48:04 +01:00
parent b453c442c2
commit d0c3492cc7

View file

@ -183,6 +183,7 @@ def normaliseURL(url):
"""Removes any leading empty segments to avoid breaking urllib2; also replaces """Removes any leading empty segments to avoid breaking urllib2; also replaces
HTML entities and character references. HTML entities and character references.
""" """
# XXX does not work for python3
if isinstance(url, unicode): if isinstance(url, unicode):
url = url.encode(UrlEncoding, 'ignore') url = url.encode(UrlEncoding, 'ignore')
# XXX: brutal hack # XXX: brutal hack
@ -221,7 +222,12 @@ def check_robotstxt(url, session):
def get_robotstxt_parser(url, session=None): def get_robotstxt_parser(url, session=None):
"""Get a RobotFileParser for the given robots.txt URL.""" """Get a RobotFileParser for the given robots.txt URL."""
rp = robotparser.RobotFileParser() rp = robotparser.RobotFileParser()
try:
req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False) req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
except Exception:
# connect or timeout errors are treated as an absent robots.txt
rp.allow_all = True
else:
if req.status_code in (401, 403): if req.status_code in (401, 403):
rp.disallow_all = True rp.disallow_all = True
elif req.status_code >= 400: elif req.status_code >= 400: