More relaxed robots.txt handling.
This is in line with how Perl's LWP::RobotUA and Google handle server errors when fetching robots.txt: just assume access is allowed. See https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
parent 88e387ad15
commit 5affd8af68
1 changed file with 2 additions and 4 deletions
@@ -282,12 +282,10 @@ def get_robotstxt_parser(url, session=None):
     try:
         req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
     except Exception:
-        # connect or timeout errors are treated as an absent robotst.txt
+        # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
     else:
-        if req.status_code in (401, 403):
-            rp.disallow_all = True
-        elif req.status_code >= 400:
+        if req.status_code >= 400:
             rp.allow_all = True
         elif req.status_code == 200:
             rp.parse(req.text.splitlines())
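For reference, a minimal standalone sketch of the behaviour this commit moves to, using the standard library's urllib.robotparser plus requests instead of the project's own urlopen/session helpers; the function name get_robots_parser, the timeout value, and the user agent string are illustrative assumptions, not part of this change:

import urllib.robotparser
import requests

def get_robots_parser(url, timeout=30):
    # Hypothetical helper mirroring the patched get_robotstxt_parser() logic.
    rp = urllib.robotparser.RobotFileParser()
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connect or timeout errors are treated as an absent robots.txt.
        rp.allow_all = True
    else:
        if resp.status_code >= 400:
            # Any HTTP error (401 and 403 included) now means "assume access
            # is allowed", matching LWP::RobotUA and Google's crawler.
            rp.allow_all = True
        elif resp.status_code == 200:
            rp.parse(resp.text.splitlines())
    return rp

rp = get_robots_parser("https://example.com/robots.txt")
print(rp.can_fetch("ExampleBot/1.0", "https://example.com/some/page"))

Before this change, a 401 or 403 response set rp.disallow_all = True, which blocked the whole site whenever robots.txt itself was access-protected.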