More relaxed robots.txt handling.
This is in line with how Perl's LWP::RobotUA and Google handle server errors when fetching robots.txt: just assume access is allowed. See https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
parent 88e387ad15
commit 5affd8af68
1 changed file with 2 additions and 4 deletions
@@ -282,12 +282,10 @@ def get_robotstxt_parser(url, session=None):
     try:
         req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
     except Exception:
-        # connect or timeout errors are treated as an absent robotst.txt
+        # connect or timeout errors are treated as an absent robots.txt
         rp.allow_all = True
     else:
-        if req.status_code in (401, 403):
-            rp.disallow_all = True
-        elif req.status_code >= 400:
+        if req.status_code >= 400:
             rp.allow_all = True
         elif req.status_code == 200:
             rp.parse(req.text.splitlines())
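For reference, a minimal standalone sketch of the behaviour this commit moves to, using the standard library's urllib.robotparser plus requests instead of the project's own urlopen/session helpers; the function name get_robots_parser, the timeout value, and the user agent string are illustrative assumptions, not part of this change:

import urllib.robotparser
import requests

def get_robots_parser(url, timeout=30):
    # Hypothetical helper mirroring the patched get_robotstxt_parser() logic.
    rp = urllib.robotparser.RobotFileParser()
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connect or timeout errors are treated as an absent robots.txt.
        rp.allow_all = True
    else:
        if resp.status_code >= 400:
            # Any HTTP error (401 and 403 included) now means "assume access
            # is allowed", matching LWP::RobotUA and Google's crawler.
            rp.allow_all = True
        elif resp.status_code == 200:
            rp.parse(resp.text.splitlines())
    return rp

rp = get_robots_parser("https://example.com/robots.txt")
print(rp.can_fetch("ExampleBot/1.0", "https://example.com/some/page"))

Before this change, a 401 or 403 response set rp.disallow_all = True, which blocked the whole site whenever robots.txt itself was access-protected.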