Catch robots.txt errors.
This commit is contained in:
parent
b453c442c2
commit
d0c3492cc7
1 changed file with 12 additions and 6 deletions
|
@ -183,6 +183,7 @@ def normaliseURL(url):
|
||||||
"""Removes any leading empty segments to avoid breaking urllib2; also replaces
|
"""Removes any leading empty segments to avoid breaking urllib2; also replaces
|
||||||
HTML entities and character references.
|
HTML entities and character references.
|
||||||
"""
|
"""
|
||||||
|
# XXX does not work for python3
|
||||||
if isinstance(url, unicode):
|
if isinstance(url, unicode):
|
||||||
url = url.encode(UrlEncoding, 'ignore')
|
url = url.encode(UrlEncoding, 'ignore')
|
||||||
# XXX: brutal hack
|
# XXX: brutal hack
|
||||||
|
@ -221,13 +222,18 @@ def check_robotstxt(url, session):
|
||||||
def get_robotstxt_parser(url, session=None):
|
def get_robotstxt_parser(url, session=None):
|
||||||
"""Get a RobotFileParser for the given robots.txt URL."""
|
"""Get a RobotFileParser for the given robots.txt URL."""
|
||||||
rp = robotparser.RobotFileParser()
|
rp = robotparser.RobotFileParser()
|
||||||
req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
|
try:
|
||||||
if req.status_code in (401, 403):
|
req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
|
||||||
rp.disallow_all = True
|
except Exception:
|
||||||
elif req.status_code >= 400:
|
# connect or timeout errors are treated as an absent robots.txt
|
||||||
rp.allow_all = True
|
rp.allow_all = True
|
||||||
elif req.status_code == 200:
|
else:
|
||||||
rp.parse(req.content.splitlines())
|
if req.status_code in (401, 403):
|
||||||
|
rp.disallow_all = True
|
||||||
|
elif req.status_code >= 400:
|
||||||
|
rp.allow_all = True
|
||||||
|
elif req.status_code == 200:
|
||||||
|
rp.parse(req.content.splitlines())
|
||||||
return rp
|
return rp
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue