From 5affd8af68b8a09d815815897ad8495732da9d57 Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher
Date: Wed, 15 Jul 2015 19:11:55 +0200
Subject: [PATCH] More relaxed robots.txt handling.

This is in line with how Perl's LWP::RobotUA and Google handle server
errors when fetching robots.txt: Just assume access is allowed.

See https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
---
 dosagelib/util.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/dosagelib/util.py b/dosagelib/util.py
index f6f93bd10..0f54ca92f 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -282,12 +282,10 @@ def get_robotstxt_parser(url, session=None):
     try:
         req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
     except Exception:
-        # connect or timeout errors are treated as an absent robotst.txt
+        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
     else:
-        if req.status_code in (401, 403):
-            rp.disallow_all = True
-        elif req.status_code >= 400:
+        if req.status_code >= 400:
             rp.allow_all = True
         elif req.status_code == 200:
             rp.parse(req.text.splitlines())
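
For context, a minimal sketch (not part of the patch) of the effect of the relaxed policy, assuming Python's standard RobotFileParser (urllib.robotparser in Python 3); dosage's own imports may differ. The allow_all flag set in the patched branches makes every can_fetch() check succeed, so a fetch error or any 4xx/5xx answer no longer blocks crawling. The user agent string and URL below are illustrative only.

    from urllib.robotparser import RobotFileParser

    # Simulate the patched error branches: robots.txt could not be fetched,
    # or the server answered with a 4xx/5xx status, so access is assumed allowed.
    rp = RobotFileParser()
    rp.allow_all = True

    # With allow_all set, can_fetch() returns True for any agent and URL.
    print(rp.can_fetch("dosage", "http://example.com/comic/"))  # True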